LCOV - coverage_filtered.info - nntrainer/tensor/float

LCOV - code coverage report

Current view:	top level - nntrainer/tensor - float_tensor.cpp (source / functions)		Coverage	Total	Hit
Test:	coverage_filtered.info	Lines:	76.6 %	738	565
Test Date:	2025-12-14 20:38:17	Functions:	90.5 %	74	67

            Line data    Source code

       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * @file        float_tensor.cpp
       4              :  * @date        01 December 2023
       5              :  * @brief       This is FloatTensor class for 32-bit floating point calculation
       6              :  * @see         https://github.com/nnstreamer/nntrainer
       7              :  * @author      Jijoong Moon <jijoong.moon@samsung.com>
       8              :  * @author      Donghyeon Jeong <dhyeon.jeong@samsung.com>
       9              :  * @bug         No known bugs except for NYI items
      10              :  */
      11              : 
      12              : #include <iomanip>
      13              : #include <iostream>
      14              : #include <numeric>
      15              : 
      16              : #include <chrono>
      17              : #include <cpu_backend.h>
      18              : #include <float_tensor.h>
      19              : #include <int4_tensor.h>
      20              : #include <q4_0_utils.h>
      21              : 
      22              : #include <tensor.h>
      23              : #include <util_func.h>
      24              : 
      25              : #ifdef ENABLE_OPENCL
      26              : #include "blas_kernels.h"
      27              : #endif
      28              : 
      29              : namespace nntrainer {
      30              : 
      31       282558 : FloatTensor::FloatTensor(std::string name_, Tformat fm) :
      32       565116 :   TensorBase(name_, fm, Tdatatype::FP32) {}
      33              : 
      34       375559 : FloatTensor::FloatTensor(const TensorDim &d, bool alloc_now, Initializer init,
      35       375559 :                          std::string name) :
      36       375559 :   TensorBase(d, alloc_now, init, name) {
      37       375559 :   if (alloc_now)
      38       341388 :     allocate();
      39       375559 : }
      40              : 
      41       335977 : FloatTensor::FloatTensor(const TensorDim &d, const void *buf) :
      42       335977 :   FloatTensor(d, true) {
      43       335977 :   if (d.getDataLen() != 0) {
      44       335977 :     if (buf != nullptr)
      45        13863 :       copy(buf);
      46              :   }
      47       335977 : }
      48              : 
      49        10794 : bool FloatTensor::operator==(const FloatTensor &rhs) const {
      50        10794 :   const float *_data = (float *)getData();
      51        10794 :   const float *_rdata = (float *)rhs.getData();
      52      1403019 :   for (size_t i = 0; i < size(); ++i) {
      53              :     /** not checking sign change is intentional to avoid float calculation
      54              :      * errors around 0 */
      55      1392265 :     if (std::isnan(_data[i]) || std::isnan(_rdata[i]) ||
      56      1392264 :         std::fabs(_data[i] - _rdata[i]) > epsilon)
      57              :       return false;
      58              :   }
      59              : 
      60              :   return true;
      61              : }
      62              : 
      63              : /// @todo support allocation by src_tensor
      64       643255 : void FloatTensor::allocate() {
      65       643255 :   if (empty() || data)
      66              :     return;
      67              : 
      68       643245 :   if (src_tensor) {
      69              :     /// allocate data based on the source tensor
      70       301850 :     allocateSrcTensor();
      71              :     /** as this memory is shared, do NOT initialize */
      72              :   } else {
      73              :     /// allocate new memory for the tensor data
      74              :     MemoryData *mem_data;
      75              : 
      76   4522423516 :     mem_data = new MemoryData((void *)(new float[dim.getDataLen()]{}));
      77       341395 :     data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
      78       341395 :       delete[] mem_data->template getAddr<float>();
      79       341395 :       delete mem_data;
      80              :     });
      81              : 
      82       341395 :     offset = 0;
      83       341395 :     initialize();
      84              :   }
      85              : }
      86              : 
      87          906 : void FloatTensor::deallocate() {
      88              :   data = nullptr;
      89          906 :   offset = 0;
      90          906 : }
      91              : 
      92    136172122 : void *FloatTensor::getData() const {
      93    136172122 :   if (!data)
      94              :     return nullptr;
      95              : 
      96              :   data->validate();
      97    136170503 :   return data->getAddr<float>() + offset;
      98              : }
      99              : 
     100            0 : void *FloatTensor::getData(size_t idx) const {
     101            0 :   if (!data)
     102              :     return nullptr;
     103              : 
     104              :   data->validate();
     105            0 :   return data->getAddr<float>() + offset + idx;
     106              : }
     107              : 
     108      5427307 : void *FloatTensor::getAddress(unsigned int i) {
     109      5427307 :   size_t index = getIndex(batch(), channel(), height(), width());
     110      5427307 :   if (i > index) {
     111              :     return nullptr;
     112              :   }
     113      5427307 :   return &((float *)getData())[i];
     114              : }
     115              : 
     116       148414 : const void *FloatTensor::getAddress(unsigned int i) const {
     117       148414 :   size_t index = getIndex(batch(), channel(), height(), width());
     118       148414 :   if (i > index) {
     119              :     return nullptr;
     120              :   }
     121       148414 :   return &((float *)getData())[i];
     122              : }
     123              : 
     124         4584 : const float &FloatTensor::getValue(unsigned int i) const {
     125         4584 :   return ((float *)getData())[i];
     126              : }
     127              : 
     128         1291 : float &FloatTensor::getValue(unsigned int i) { return ((float *)getData())[i]; }
     129              : 
     130         4584 : const float &FloatTensor::getValue(unsigned int b, unsigned int c,
     131              :                                    unsigned int h, unsigned int w) const {
     132         4584 :   return getValue(getIndex(b, c, h, w));
     133              : }
     134              : 
     135         1260 : float &FloatTensor::getValue(unsigned int b, unsigned int c, unsigned int h,
     136              :                              unsigned int w) {
     137         1260 :   return getValue(getIndex(b, c, h, w));
     138              : }
     139              : 
     140        81317 : void FloatTensor::setValue(float value) {
     141        81317 :   float *data = (float *)getData();
     142        81317 :   std::fill(data, data + size(), value);
     143        81317 : }
     144              : 
     145     35425636 : void FloatTensor::setValue(unsigned int b, unsigned int c, unsigned int h,
     146              :                            unsigned int w, float value) {
     147     35425636 :   ((float *)getData())[getIndex(b, c, h, w)] = value;
     148     35425636 : }
     149              : 
     150        10913 : void FloatTensor::addValue(unsigned int b, unsigned int c, unsigned int h,
     151              :                            unsigned int w, float value, float beta) {
     152        10913 :   auto const &idx = getIndex(b, c, h, w);
     153        10913 :   ((float *)getData())[idx] *= beta;
     154        10913 :   ((float *)getData())[idx] += value;
     155        10913 : }
     156              : 
     157        42378 : void FloatTensor::setZero() {
     158        42378 :   if (contiguous) {
     159              :     //  sscal(size(), 0, getData<float>(), 1);
     160              :     /// @note we cannot use sscal, when we set zero. if the data is inf or
     161              :     /// NaN, then the inf or NaN still remain.
     162        42378 :     memset((float *)getData(), 0, sizeof(float) * size());
     163              :   } else {
     164              :     /// @todo implement apply_i
     165              :     // apply_i<float>([](float val) -> float { return 0; });
     166            0 :     setValue(0);
     167              :   }
     168        42378 : }
     169              : 
     170          110 : void FloatTensor::setRandNormal(float mean, float stddev) {
     171          110 :   setDist<std::normal_distribution<float>>(
     172              :     std::normal_distribution<float>(mean, stddev));
     173          110 : }
     174              : 
     175        18101 : void FloatTensor::setRandUniform(float min, float max) {
     176        18101 :   setDist<std::uniform_real_distribution<float>>(
     177              :     std::uniform_real_distribution<float>(min, max));
     178        18101 : }
     179              : 
     180            3 : void FloatTensor::setRandBernoulli(float probability) {
     181            3 :   setDist<std::bernoulli_distribution>(
     182              :     std::bernoulli_distribution(probability));
     183            3 : }
     184              : 
     185       360584 : void FloatTensor::initialize() {
     186       360584 :   if (empty() || !isAllocated())
     187              :     return;
     188              : 
     189              :   unsigned int fan_in, fan_out;
     190              : 
     191              :   /// @fixme: when unit is equal to one, this does not work, we need to rely on
     192              :   /// effective dimension then actual numbers here. For now, some heuristics
     193              :   /// added to infer what would be fan_in/fan_out
     194       360583 :   if (dim.batch() * dim.channel() * dim.height() == 1) {
     195       233468 :     fan_out = fan_in = dim.width();
     196       127115 :   } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
     197        12222 :     fan_in = dim.height();
     198        12222 :     fan_out = dim.width();
     199              :   } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
     200       114893 :     auto field_size = dim.height() * dim.width();
     201              : 
     202              :     // this also handles below cases.
     203              :     // 1. fan_in = fan_out = 1 as well.
     204              :     // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1
     205       114893 :     fan_in = dim.channel() * field_size;
     206       114893 :     fan_out = dim.batch() * field_size;
     207              :   }
     208              : 
     209       360583 :   switch (initializer) {
     210         5666 :   case Initializer::ZEROS:
     211         5666 :     setZero();
     212         5666 :     break;
     213          242 :   case Initializer::ONES:
     214          242 :     setValue(1.0f);
     215          242 :     break;
     216            0 :   case Initializer::LECUN_NORMAL:
     217            0 :     setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
     218            0 :     break;
     219            0 :   case Initializer::XAVIER_NORMAL:
     220            0 :     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
     221            0 :     break;
     222            2 :   case Initializer::HE_NORMAL:
     223            2 :     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
     224            2 :     break;
     225            0 :   case Initializer::LECUN_UNIFORM:
     226            0 :     setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
     227            0 :     break;
     228         2294 :   case Initializer::XAVIER_UNIFORM:
     229         2294 :     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
     230         2294 :                    sqrtFloat(6.0 / (fan_in + fan_out)));
     231         2294 :     break;
     232            0 :   case Initializer::HE_UNIFORM:
     233            0 :     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
     234            0 :                    sqrtFloat(6.0 / (fan_in)));
     235              :   default:
     236              :     break;
     237              :   }
     238              : 
     239       360583 :   putData();
     240              : }
     241              : 
     242            7 : void FloatTensor::initialize(Initializer init) {
     243            7 :   initializer = init;
     244            7 :   initialize();
     245            7 : }
     246              : 
     247        70014 : Tensor &FloatTensor::apply(std::function<float(float)> f,
     248              :                            Tensor &output) const {
     249        83169 :   CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
     250              : 
     251        70014 :   if (contiguous && output.getContiguous()) {
     252        64137 :     const float *data = (float *)getData();
     253              :     float *rdata = output.getData<float>();
     254              : 
     255       128274 :     std::transform(data, data + size(), rdata, f);
     256         5877 :   } else if (strides[3] == 1 && output.getStrides()[3] == 1) {
     257              :     /** @todo optimize this with combining these loops where stride is 1 */
     258        12768 :     for (unsigned int b = 0; b < batch(); ++b) {
     259        13782 :       for (unsigned int c = 0; c < channel(); ++c) {
     260        13800 :         for (unsigned int h = 0; h < height(); ++h) {
     261         6909 :           float *out_data = output.getAddress<float>(b, c, h, 0);
     262         6909 :           const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
     263        20727 :           std::transform(in_data, in_data + width(), out_data, f);
     264              :         }
     265              :       }
     266              :     }
     267              :   } else {
     268            0 :     for (unsigned int b = 0; b < batch(); ++b) {
     269            0 :       for (unsigned int c = 0; c < channel(); ++c) {
     270            0 :         for (unsigned int h = 0; h < height(); ++h) {
     271            0 :           for (unsigned int w = 0; w < width(); ++w) {
     272            0 :             output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
     273              :           }
     274              :         }
     275              :       }
     276              :     }
     277              :   }
     278              : 
     279        70014 :   return output;
     280              : }
     281              : 
     282        12608 : Tensor FloatTensor::multiply_strided(Tensor const &m, Tensor &output,
     283              :                                      const float beta) const {
     284        12649 :   CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
     285              : 
     286        12608 :   if (size() != m.size() || size() != output.size())
     287              :     throw std::invalid_argument(
     288            3 :       "Strided multiplication does not support broadcasting");
     289              : 
     290        12606 :   NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
     291              :     << getName() << " is not allocated";
     292        12605 :   NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
     293            1 :     << m.getName() << " is not allocated";
     294        12604 :   NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
     295            1 :     << output.getName() << " is not allocated";
     296              : 
     297        12602 :   if (strides[3] != 1 || m.getStrides()[3] != 1 ||
     298        37806 :       output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
     299         2834 :     for (unsigned int b = 0; b < batch(); ++b) {
     300         3048 :       for (unsigned int c = 0; c < channel(); ++c) {
     301         3054 :         for (unsigned int h = 0; h < height(); ++h) {
     302         5655 :           for (unsigned int w = 0; w < width(); ++w) {
     303         4125 :             output.addValue(
     304         4125 :               b, c, h, w, getValue(b, c, h, w) * m.getValue<float>(b, c, h, w),
     305              :               beta);
     306              :           }
     307              :         }
     308              :       }
     309              :     }
     310              :   } else {
     311              :     /** @todo optimize by combining these loops where stride is 1 */
     312        11292 :     if (getFormat() == Tformat::NCHW) {
     313        29292 :       for (unsigned int b = 0; b < batch(); ++b) {
     314        44925 :         for (unsigned int c = 0; c < channel(); ++c) {
     315       167899 :           for (unsigned int h = 0; h < height(); ++h) {
     316       140974 :             float *out_data = output.getAddress<float>(b, c, h, 0);
     317       140974 :             const float *m_data = m.getAddress<float>(b, c, h, 0);
     318       140974 :             const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
     319       140974 :             std::transform(in_data, in_data + width(), m_data, out_data,
     320              :                            std::multiplies<float>());
     321              :           }
     322              :         }
     323              :       }
     324              :     } else {
     325            0 :       for (unsigned int b = 0; b < batch(); ++b) {
     326            0 :         for (unsigned int h = 0; h < height(); ++h) {
     327            0 :           for (unsigned int w = 0; w < width(); ++w) {
     328            0 :             float *out_data = output.getAddress<float>(b, 0, h, w);
     329            0 :             const float *m_data = m.getAddress<float>(b, 0, h, w);
     330            0 :             const float *in_data = (float *)getAddress(getIndex(b, 0, h, w));
     331            0 :             std::transform(in_data, in_data + channel(), m_data, out_data,
     332              :                            std::multiplies<float>());
     333              :           }
     334              :         }
     335              :       }
     336              :     }
     337              :   }
     338              : 
     339        12602 :   return output;
     340              : }
     341              : 
     342         2609 : int FloatTensor::multiply_i(float const &value) {
     343         2609 :   float *data = (float *)getData();
     344         2609 :   unsigned int len = size();
     345              : 
     346         2609 :   sscal(len, value, data, 1);
     347              : 
     348         2609 :   return ML_ERROR_NONE;
     349              : }
     350              : 
     351         7691 : Tensor &FloatTensor::multiply(float const &value, Tensor &out) const {
     352              :   auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
     353         7691 :   apply(f, out);
     354         7691 :   return out;
     355              : }
     356              : 
     357        17213 : Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output,
     358              :                               const float beta) const {
     359              :   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
     360              :                float *out_buf) {
     361        29863 :     ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
     362        29863 :             strides[3]);
     363              :   };
     364              : 
     365        17213 :   NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
     366              :     << "Tensor Format of " << getName() << ":"
     367              :     << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
     368            0 :     << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
     369              : 
     370        17213 :   NNTR_THROW_IF(!contiguous || !m.getContiguous() || !output.getContiguous(),
     371              :                 std::invalid_argument)
     372              :     << getName() << " is not contiguous, cannot multiply";
     373              : 
     374        17213 :   NNTR_THROW_IF(!contiguous || !m.getContiguous() || !output.getContiguous(),
     375              :                 std::invalid_argument)
     376              :     << getName() << " is not contiguous, cannot multiply";
     377              : 
     378        17213 :   apply_broadcast(m, f, output);
     379        17074 :   return output;
     380              : }
     381              : 
     382         6273 : Tensor &FloatTensor::divide(float const &value, Tensor &output) const {
     383              :   auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
     384         6273 :   apply(f, output);
     385         6273 :   return output;
     386              : }
     387              : 
     388          198 : Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const {
     389              :   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
     390              :                float *out_buf) {
     391          333 :     ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
     392              :   };
     393              : 
     394          198 :   apply_broadcast(m, f, output);
     395          190 :   return output;
     396              : }
     397              : 
     398          211 : Tensor &FloatTensor::add_strided(Tensor const &input, Tensor &output,
     399              :                                  const float beta) const {
     400          212 :   NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
     401              :     << getName() << " is not allocated";
     402          210 :   NNTR_THROW_IF(input.getData<float>() == nullptr, std::invalid_argument)
     403            0 :     << input.getName() << " is not allocated";
     404          211 :   NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
     405            1 :     << output.getName() << " is not allocated";
     406              : 
     407          209 :   if (strides[3] != 1 || input.getStrides()[3] != 1 ||
     408          627 :       output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
     409          112 :     for (unsigned int b = 0; b < batch(); ++b) {
     410          174 :       for (unsigned int c = 0; c < channel(); ++c) {
     411          198 :         for (unsigned int h = 0; h < height(); ++h) {
     412          567 :           for (unsigned int w = 0; w < width(); ++w) {
     413          459 :             output.setValue(b, c, h, w,
     414          459 :                             getValue(b, c, h, w) +
     415          459 :                               input.getValue<float>(b, c, h, w) * beta);
     416              :           }
     417              :         }
     418              :       }
     419              :     }
     420              :   } else {
     421              :     /** @todo optimize this with combining these loops where stride is 1 */
     422          181 :     if (this->getFormat() == Tformat::NCHW) {
     423          616 :       for (unsigned int b = 0; b < batch(); ++b) {
     424          876 :         for (unsigned int c = 0; c < channel(); ++c) {
     425          972 :           for (unsigned int h = 0; h < height(); ++h) {
     426          531 :             float *out_data = output.getAddress<float>(b, c, h, 0);
     427          531 :             const float *in_data = input.getAddress<float>(b, c, h, 0);
     428          531 :             const float *_data = (float *)getAddress(getIndex(b, c, h, 0));
     429          531 :             std::transform(_data, _data + width(), in_data, out_data,
     430              :                            std::plus<float>());
     431              :           }
     432              :         }
     433              :       }
     434              :     } else {
     435            0 :       for (unsigned int b = 0; b < batch(); ++b) {
     436            0 :         for (unsigned int h = 0; h < height(); ++h) {
     437            0 :           for (unsigned int w = 0; w < width(); ++w) {
     438            0 :             float *out_data = output.getAddress<float>(b, 0, h, w);
     439            0 :             const float *in_data = input.getAddress<float>(b, 0, h, w);
     440            0 :             const float *_data = (float *)getAddress(getIndex(b, 0, h, w));
     441            0 :             std::transform(_data, _data + channel(), in_data, out_data,
     442              :                            std::plus<float>());
     443              :           }
     444              :         }
     445              :       }
     446              :     }
     447              :   }
     448              : 
     449          209 :   return output;
     450              : }
     451              : 
     452         2698 : int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
     453              :                                Tensor &m, unsigned int incX, unsigned int incY,
     454              :                                const Tensor alphas, unsigned int alpha_idx) {
     455         2698 :   saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
     456         2698 :         (float *)getAddress(addr_idx), incY);
     457              : 
     458         2698 :   return ML_ERROR_NONE;
     459              : }
     460              : 
     461         7432 : Tensor &FloatTensor::add(float const &value, Tensor &output) const {
     462              :   auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
     463         7432 :   apply(f, output);
     464         7432 :   return output;
     465              : }
     466              : 
     467        59384 : Tensor &FloatTensor::add(Tensor const &m, Tensor &output,
     468              :                          float const alpha) const {
     469              :   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
     470              :                float *out_buf) {
     471       112969 :     ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
     472       112969 :             strides[3]);
     473              :   };
     474        59384 :   apply_broadcast(m, f, output);
     475        59353 :   return output;
     476              : }
     477              : 
     478          174 : Tensor &FloatTensor::subtract(float const &value, Tensor &output) const {
     479              :   auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
     480          174 :   apply(f, output);
     481          174 :   return output;
     482              : }
     483              : 
     484          453 : void FloatTensor::sum_by_batch(Tensor &output) const {
     485          453 :   size_t feat_len = dim.getFeatureLen();
     486          453 :   size_t batch = dim.batch();
     487              : 
     488          453 :   const float *data = (float *)getData();
     489              :   float *out_data = output.getData<float>();
     490              : 
     491          453 :   Tensor ones(1, 1, 1, feat_len, this->getFormat());
     492          453 :   ones.setValue(1.0);
     493          453 :   sgemv((unsigned int)dim.getStorageOrder(), false, (int)batch, (int)feat_len,
     494              :         1, data, (int)feat_len, ones.getData<float>(), 1, 0.0, out_data, 1);
     495          453 : }
     496              : 
     497       126666 : Tensor &FloatTensor::sum(unsigned int axis, Tensor &output, float alpha,
     498              :                          float beta) const {
     499       126666 :   const float *data = (float *)getData();
     500              : 
     501       126666 :   NNTR_THROW_IF(!contiguous, std::invalid_argument)
     502              :     << getName() << " is not contiguous, cannot sum";
     503              : 
     504       126666 :   if (axis >= 4)
     505            3 :     throw std::out_of_range("Error: axis is invalid");
     506              : 
     507       126663 :   if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
     508       111118 :     CREATE_IF_EMPTY_DIMS(output, dim);
     509        58861 :     scopy(size(), (float *)getData(), 1, output.getData<float>(), 1);
     510        58861 :     return output;
     511              :   }
     512              : 
     513        67802 :   switch (axis) {
     514         1654 :   case 0: {
     515         1939 :     CREATE_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(),
     516              :                          getTensorType());
     517         1654 :     size_t feat_len = dim.getFeatureLen();
     518         1654 :     size_t batch = dim.batch();
     519         1654 :     Tensor ones(1, 1, 1, batch, getTensorType());
     520         1654 :     ones.setValue(alpha);
     521         1654 :     sgemv((unsigned int)dim.getStorageOrder(), true, (int)batch, (int)feat_len,
     522              :           1, data, (int)feat_len, ones.getData<float>(), 1, beta,
     523              :           output.getData<float>(), 1);
     524         1654 :   } break;
     525           64 :   case 1: {
     526           90 :     CREATE_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType());
     527           64 :     if (this->getFormat() == Tformat::NHWC) {
     528            0 :       unsigned int feat_len = output.getDim().getDataLen();
     529            0 :       unsigned int t_axis = dim[1];
     530            0 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     531            0 :       ones.setValue(alpha);
     532            0 :       sgemv((unsigned int)dim.getStorageOrder(), false, (int)feat_len,
     533              :             (int)t_axis, 1, data, (int)t_axis, ones.getData<float>(), 1, beta,
     534              :             output.getData<float>(), 1);
     535            0 :     } else {
     536           64 :       unsigned int feat_len = dim[2] * dim[3];
     537           64 :       unsigned int t_axis = dim[1];
     538           64 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     539           64 :       ones.setValue(alpha);
     540              :       float *rdata = output.getData<float>();
     541          218 :       for (unsigned int k = 0; k < dim[0]; ++k) {
     542          154 :         sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     543          154 :               (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
     544          154 :               ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
     545              :       }
     546           64 :     }
     547              :   } break;
     548         6658 :   case 2: {
     549        13211 :     CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType());
     550              :     if (this->getFormat() == Tformat::NHWC) {
     551            0 :       unsigned int feat_len = dim[1] * dim[3];
     552            0 :       unsigned int t_axis = dim[2];
     553            0 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     554            0 :       ones.setValue(alpha);
     555              :       float *rdata = output.getData<float>();
     556            0 :       for (unsigned int k = 0; k < dim[0]; ++k) {
     557            0 :         sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     558            0 :               (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
     559            0 :               ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
     560              :       }
     561            0 :     } else {
     562         6658 :       unsigned int t_3 = dim[3];
     563         6658 :       unsigned int t_axis = dim[2];
     564         6658 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     565         6658 :       ones.setValue(alpha);
     566              : 
     567         6658 :       if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
     568              :         float *rdata = output.getData<float>();
     569        13364 :         for (unsigned int k = 0; k < dim[0]; ++k) {
     570        13516 :           for (unsigned int c = 0; c < dim[1]; ++c) {
     571         6810 :             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
     572              :             unsigned int ridx =
     573         6810 :               k * output.getDim().getFeatureLen() + c * dim[3];
     574              : 
     575        13620 :             sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     576         6810 :                   (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
     577         6810 :                   beta, &rdata[ridx], 1);
     578              :           }
     579              :         }
     580              :       } else {
     581            0 :         sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     582            0 :               (int)output.getDim().getDataLen(), 1, data, (int)t_axis,
     583              :               ones.getData<float>(), 1, beta, output.getData<float>(), 1);
     584              :       }
     585         6658 :     }
     586              :   } break;
     587        59426 :   case 3: {
     588       117785 :     CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1,
     589              :                          this->getTensorType());
     590        59426 :     if (this->getFormat() == Tformat::NHWC) {
     591            0 :       unsigned int t_3 = dim[1];
     592            0 :       unsigned int t_axis = dim[3];
     593            0 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     594            0 :       ones.setValue(alpha);
     595              :       float *rdata = output.getData<float>();
     596            0 :       for (unsigned int k = 0; k < dim[0]; ++k) {
     597            0 :         for (unsigned int c = 0; c < dim[2]; ++c) {
     598            0 :           unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
     599            0 :           unsigned int ridx = k * output.getDim().getFeatureLen() + c * dim[1];
     600            0 :           sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     601            0 :                 (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
     602            0 :                 beta, &rdata[ridx], 1);
     603              :         }
     604              :       }
     605            0 :     } else {
     606        59426 :       unsigned int m = output.getDim().getDataLen();
     607        59426 :       unsigned int n = dim[3];
     608        59426 :       Tensor ones(1, 1, 1, n, getTensorType());
     609        59426 :       ones.setValue(alpha);
     610              : 
     611        59426 :       if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
     612        59426 :         sgemv((unsigned int)dim.getStorageOrder(), false, (int)m, (int)n, 1,
     613              :               data, (int)n, ones.getData<float>(), 1, beta,
     614              :               output.getData<float>(), 1);
     615              :       } else {
     616              :         float *rdata = output.getData<float>();
     617              : 
     618            0 :         for (unsigned int k = 0; k < dim[0]; ++k) {
     619            0 :           for (unsigned int c = 0; c < dim[1]; ++c) {
     620            0 :             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
     621            0 :             unsigned int ridx = k * dim[1] * dim[2] + c * dim[2];
     622              : 
     623            0 :             sgemv((unsigned int)dim.getStorageOrder(), false, (int)dim[2],
     624            0 :                   (int)n, 1, &data[idx], (int)dim[2], ones.getData<float>(), 1,
     625            0 :                   beta, &rdata[ridx], 1);
     626              :           }
     627              :         }
     628              :       }
     629        59426 :     }
     630              :   } break;
     631              :   default:
     632              :     throw std::out_of_range("Error: Dimension cannot exceed 3");
     633              :   }
     634              : 
     635              :   return output;
     636              : }
     637              : 
     638            0 : Tensor &FloatTensor::abs(Tensor &output) const {
     639              :   auto f = [](float in) { return std::abs(in); };
     640            0 :   apply(f, output);
     641            0 :   return output;
     642              : }
     643              : 
     644         1931 : float FloatTensor::l2norm() const {
     645         1931 :   return snrm2(size(), (float *)getData(), 1);
     646              : }
     647              : 
     648          790 : Tensor &FloatTensor::pow(float exponent, Tensor &output) const {
     649       128994 :   auto f = [exponent](float in) { return powf(in, exponent); };
     650          790 :   apply(f, output);
     651          790 :   return output;
     652              : }
     653              : 
     654            6 : Tensor &FloatTensor::sqrt(Tensor &output) const {
     655              :   auto f = [](float in) { return std::sqrt(in); };
     656            6 :   apply(f, output);
     657            6 :   return output;
     658              : }
     659              : 
     660            1 : Tensor &FloatTensor::erf(Tensor &output) const {
     661              :   auto f = [](float in) { return std::erf(in); };
     662            1 :   apply(f, output);
     663            1 :   return output;
     664              : }
     665              : 
     666           11 : void FloatTensor::sin(Tensor &out, float alpha) {
     667           11 :   if (!contiguous) {
     668           90 :     auto f = [alpha](float val) -> float { return std::sin(alpha * val); };
     669            2 :     apply(f, out);
     670              :   } else {
     671           10 :     sine(size(), (float *)getData(), out.getData<float>(), alpha);
     672              :   }
     673           11 : }
     674              : 
     675           14 : void FloatTensor::cos(Tensor &out, float alpha) {
     676           14 :   if (!contiguous) {
     677           90 :     auto f = [alpha](float val) -> float { return std::cos(alpha * val); };
     678            2 :     apply(f, out);
     679              :   } else {
     680           13 :     cosine(size(), (float *)getData(), out.getData<float>(), alpha);
     681              :   }
     682           14 : }
     683              : 
     684            6 : void FloatTensor::tan(Tensor &output, float alpha) {
     685           12 :   auto f = [alpha](float val) -> float { return std::tan(alpha * val); };
     686            6 :   apply(f, output);
     687            6 : }
     688              : 
     689            4 : void FloatTensor::inv_sqrt(Tensor &out) {
     690            4 :   apply([](float val) -> float { return 1 / std::sqrt(val); }, out);
     691            4 : }
     692              : 
     693        36830 : Tensor &FloatTensor::dot(Tensor const &input, Tensor &output, bool trans,
     694              :                          bool trans_in, float beta) const {
     695              :   /**
     696              :    * @note FP32.dot(input);
     697              :    * according to the input type, invoked kernels can be varied.
     698              :    */
     699        36830 :   switch (input.getDataType()) {
     700              :   /** applying sgemm/sgemv after type casting to FP32 */
     701        36824 :   case Tdatatype::FP32:
     702        36824 :     dotFloat(input, output, trans, trans_in, beta);
     703        36821 :     break;
     704            0 :   case Tdatatype::FP16:
     705            0 :     dotFloat32Float16(input, output, trans, trans_in, beta);
     706            0 :     break;
     707              :   /** applying gemm_q4_k / gemm_q6_k / gemm_q4_0 */
     708            6 :   case Tdatatype::Q4_K:
     709              :   case Tdatatype::Q6_K:
     710              :   case Tdatatype::Q4_0:
     711            6 :     dotQnK(input, output, trans, trans_in, beta, input.getDataType());
     712            6 :     break;
     713            0 :   case Tdatatype::QINT16:
     714              :   case Tdatatype::QINT8:
     715              :   case Tdatatype::QINT4:
     716            0 :     dotQInteger(input, output, trans, trans_in, beta, input.getDataType());
     717            0 :     break;
     718            0 :   default:
     719            0 :     throw std::invalid_argument("Error: unsupported datatype");
     720              :   }
     721        36827 :   return output;
     722              : }
     723              : 
     724            0 : void FloatTensor::dot(std::vector<Tensor *> input, std::vector<Tensor *> output,
     725              :                       bool trans, bool trans_in, float beta) const {
     726            0 :   float *data = (float *)getData();
     727            0 :   unsigned int M = getDim().height();
     728            0 :   unsigned int K = getDim().width();
     729            0 :   Tdatatype input_dtype = input[0]->getDataType();
     730              : 
     731              :   // Handle standard inputs
     732            0 :   if (input_dtype != Tdatatype::Q4_0 && input_dtype != Tdatatype::QINT4) {
     733            0 :     for (unsigned int i = 0; i < input.size(); ++i) {
     734            0 :       dot(*input[i], *output[i], trans, trans_in, beta);
     735              :     }
     736            0 :     return;
     737              :   }
     738              : 
     739              :   std::vector<unsigned int> Ns;
     740              :   std::vector<void *> mdatas;
     741              :   std::vector<float *> rdatas;
     742              : 
     743            0 :   for (unsigned int i = 0; i < input.size(); ++i) {
     744            0 :     Ns.push_back(input[i]->getDim().width());
     745            0 :     mdatas.push_back((void *)input[i]->getData<uint8_t>());
     746            0 :     rdatas.push_back(output[i]->getData<float>());
     747              :   }
     748              : 
     749              : #ifdef ENABLE_OPENCL
     750              :   if (input_dtype == Tdatatype::Q4_0) {
     751              :     if (M == 1) {
     752              :       for (unsigned int i = 0; i < input.size(); ++i) {
     753              :         gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
     754              :       }
     755              :     } else {
     756              :       gemm_q4_0_async_cl(mdatas, data, rdatas, M, Ns, K);
     757              :     }
     758              :   } else { // QINT4
     759              :     /// Run on GPU only when memory is a Shared Virual Memory
     760              :     if (input[0]->getMemoryData()->isSVM() &&
     761              :         output[0]->getMemoryData()->isSVM() && getMemoryData()->isSVM()) {
     762              :       std::vector<uint16_t *> scales;
     763              :       for (unsigned int i = 0; i < input.size(); ++i) {
     764              :         scales.push_back(input[i]->getScale<uint16_t>());
     765              :       }
     766              :       if (M == 1) {
     767              :         gemv_int4_async_cl(mdatas, scales, data, rdatas, K, Ns,
     768              :                            Int4QTensor::getGroupSize());
     769              :       } else {
     770              :         openvino_gemm_async_cl(data, mdatas, scales, rdatas, M, Ns, K,
     771              :                                Int4QTensor::getGroupSize());
     772              :       }
     773              :     } else {
     774              :       /// @todo This should be replaced with standard CPU INT4 computation
     775              :       for (unsigned int i = 0; i < input.size(); ++i) {
     776              :         gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
     777              :                   rdatas[i], Ns[i]);
     778              :       }
     779              :     }
     780              :   }
     781              : #else
     782            0 :   if (input_dtype == Tdatatype::Q4_0) {
     783              :     /// @todo Support multi-weight q4_0 for x64
     784            0 :     for (unsigned int i = 0; i < input.size(); ++i) {
     785            0 :       gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
     786              :     }
     787              :   } else { // QINT4
     788              :     /// @note It is essential to understand that this section of the code
     789              :     /// requires the `input` data to be converted to Q4_0 type, not QINT4 type.
     790              :     /// This should be replaced with standard CPU INT4 computation instead of
     791              :     /// using Q4_0.
     792            0 :     for (unsigned int i = 0; i < input.size(); ++i) {
     793            0 :       gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
     794              :                 rdatas[i], Ns[i]);
     795              :     }
     796              :   }
     797              : #endif
     798            0 : }
     799              : 
     800        36824 : Tensor &FloatTensor::dotFloat(Tensor const &input, Tensor &output, bool trans,
     801              :                               bool trans_in, float beta) const {
     802              :   // Comment out with intension to support the calculation wrt. batch and
     803              :   // height direction. It supposes to have this->dim as [ BxCxH,W ] and
     804              :   // input.dim is [BxCxH,W] as well if (input.dim.rank() > 2) {
     805              :   //   throw exception::not_supported("Error: support only for rank of dot "
     806              :   //                                  "matrix <= 2");
     807              :   // }
     808              : 
     809              :   // Comment out with intension to support the calculation wrt. batch and
     810              :   // height direction of this tensor. It is OK as long as input is 2D
     811        36824 :   if (trans && dim.rank() > 2) {
     812          932 :     ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
     813              :   }
     814              :   unsigned int first_three_flat, last_axis, input_first_three_flat,
     815              :     input_last_axis, M, N, K, lda, ldb, ldc;
     816              : 
     817        36824 :   calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
     818              :                       last_axis, input_first_three_flat, input_last_axis, M, N,
     819              :                       K, lda, ldb, ldc);
     820              : 
     821        36821 :   const float *data = (float *)getData();
     822              :   const float *mdata = input.getData<float>();
     823              :   float *rdata = output.getData<float>();
     824              :   const float alpha = 1.0f;
     825              : 
     826              :   /// shortcut handling in case of vector
     827              :   /// for vector, (1 * K) == (K * 1) in current memory layout...
     828              :   /// and please note that N, K, M is a fixed place holder after considering
     829              :   /// transpose.
     830              :   /// For example, there is no case like (1 * K) X (1 * K) while
     831              :   /// (1 * K) X (1 * M) can be a case
     832              :   /// case1: (1 * K) X (K * 1)
     833        36821 :   if (M == 1 && N == 1) {
     834          182 :     *rdata =
     835          182 :       sdot(K, data, 1, mdata, 1) + ((0.0f == beta) ? 0.0f : beta * *rdata);
     836              :   }
     837              :   /// case2: (M * K) X (K * 1)
     838        36639 :   else if (N == 1) {
     839        10955 :     sgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
     840              :           last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
     841              :   }
     842              :   /// case3: (1 * K) X (K * N) = 1 * N = R
     843              :   /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
     844              :   /// Effectively a translation of sgemv
     845        25684 :   else if (M == 1) {
     846         8736 :     sgemv((unsigned int)dim.getStorageOrder(), !trans_in,
     847              :           input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
     848              :           beta, rdata, 1);
     849              :   }
     850              :   /// case others: use gemm
     851              :   else {
     852        16948 :     sgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
     853              :           data, lda, mdata, ldb, beta, rdata, ldc);
     854              :   }
     855              : 
     856        36821 :   return output;
     857              : }
     858              : 
     859            0 : Tensor &FloatTensor::dotFloat32Float16(Tensor const &input, Tensor &output,
     860              :                                        bool trans, bool trans_in,
     861              :                                        float beta) const {
     862              : /// @todo remove #ifdef ENABLE_FP16
     863              : #ifdef ENABLE_FP16
     864              : 
     865              :   // Comment out with intension to support the calculation wrt. batch and
     866              :   // height direction. It supposes to have this->dim as [ BxCxH,W ] and
     867              :   // input.dim is [BxCxH,W] as well if (input.dim.rank() > 2) {
     868              :   //   throw exception::not_supported("Error: support only for rank of dot "
     869              :   //                                  "matrix <= 2");
     870              :   // }
     871              : 
     872              :   // Comment out with intension to support the calculation wrt. batch and
     873              :   // height direction of this tensor. It is OK as long as input is 2D
     874              :   if (trans && dim.rank() > 2) {
     875              :     ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
     876              :   }
     877              :   unsigned int first_three_flat, last_axis, input_first_three_flat,
     878              :     input_last_axis, M, N, K, lda, ldb, ldc;
     879              : 
     880              :   calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
     881              :                       last_axis, input_first_three_flat, input_last_axis, M, N,
     882              :                       K, lda, ldb, ldc);
     883              : 
     884              :   const float *data = (float *)getData();
     885              :   const _FP16 *mdata = input.getData<_FP16>();
     886              :   float *rdata = output.getData<float>();
     887              :   const float alpha = 1.0f;
     888              : 
     889              :   /// shortcut handling in case of vector
     890              :   /// for vector, (1 * K) == (K * 1) in current memory layout...
     891              :   /// and please note that N, K, M is a fixed place holder after considering
     892              :   /// transpose.
     893              :   /// For example, there is no case like (1 * K) X (1 * K) while
     894              :   /// (1 * K) X (1 * M) can be a case
     895              :   /// case1: (1 * K) X (K * 1)
     896              :   NNTR_THROW_IF((M == 1 && N == 1), std::invalid_argument)
     897              :     << "dotQnK does not support trans / trans_in";
     898              :   /// case2: (M * K) X (K * 1)
     899              :   if (N == 1) {
     900              :     shgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
     901              :            last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
     902              :   }
     903              :   /// case3: (1 * K) X (K * N) = 1 * N = R
     904              :   /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
     905              :   /// Effectively a translation of sgemv
     906              :   else if (M == 1) {
     907              :     hsgemv((unsigned int)dim.getStorageOrder(), !trans_in,
     908              :            input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
     909              :            beta, rdata, 1);
     910              :   }
     911              :   /// case others: use gemm
     912              :   else {
     913              :     shgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
     914              :            data, lda, mdata, ldb, beta, rdata, ldc);
     915              :   }
     916              : 
     917              :   return output;
     918              : #else
     919            0 :   throw std::invalid_argument("Error: enable-fp16 is not enabled");
     920              : #endif
     921              : }
     922              : 
     923            6 : Tensor &FloatTensor::dotQnK(Tensor const &input, Tensor &output, bool trans,
     924              :                             bool trans_in, float beta, Tdatatype dtype) const {
     925              :   ///@note Be cautious.
     926              :   /// Qn_K does not support transpose in principle.
     927              :   /// This trans option only aims to support Tensor Dimension only,
     928              :   /// not data.
     929              :   ///@note trans is not yet applied
     930            6 :   NNTR_THROW_IF(trans, std::invalid_argument)
     931              :     << "dotQnK does not support trans";
     932              : 
     933            6 :   float *data = (float *)getData();
     934              :   uint8_t *mdata = input.getData<uint8_t>();
     935              :   float *rdata = output.getData<float>();
     936              : 
     937              :   unsigned int M, N, K;
     938            6 :   M = getDim().height();
     939            6 :   K = getDim().width();
     940            6 :   N = trans_in ? input.getDim().height() : input.getDim().width();
     941              : 
     942            6 :   switch (dtype) {
     943            3 :   case Tdatatype::Q4_K:
     944            3 :     gemm_q4_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
     945            3 :     break;
     946            2 :   case Tdatatype::Q6_K:
     947            2 :     gemm_q6_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
     948            2 :     break;
     949            1 :   case Tdatatype::Q4_0:
     950            1 :     M = getDim().height();
     951            1 :     K = getDim().width();
     952            1 :     N = input.getDim().width();
     953              : #ifdef ENABLE_OPENCL
     954              :     if (M == 1) {
     955              :       gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
     956              :     } else {
     957              :       gemm_q4_0_cl((void *)mdata, data, rdata, M, N, K);
     958              :     }
     959              : #else
     960            1 :     gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
     961              : #endif
     962            1 :     break;
     963              : 
     964            0 :   default:
     965            0 :     throw std::invalid_argument("Error: unsupported datatype");
     966              :   }
     967              : 
     968            6 :   return output;
     969              : }
     970              : 
     971            0 : Tensor &FloatTensor::dotQInteger(Tensor const &input, Tensor &output,
     972              :                                  bool trans, bool trans_in, float beta,
     973              :                                  Tdatatype dtype) const {
     974              : 
     975            0 :   float *data = (float *)getData();
     976              :   char *mdata = input.getData<char>();
     977              :   float *rdata = output.getData<float>();
     978              : 
     979            0 :   unsigned int M = getDim().height();
     980            0 :   unsigned int K = getDim().width();
     981            0 :   unsigned int N = output.getDim().width();
     982              : 
     983              : #ifndef ENABLE_OPENCL
     984              : #ifdef ENABLE_FP16
     985              :   if (input.q_scheme() == QScheme::PER_CHANNEL_AFFINE) {
     986              :     uint32_t opt_kernel_idx = (M == 1) ? 1 : 5;
     987              :     nntr_gemm_qai8dxp_qsi4cxp_packed(
     988              :       M, N, K, (void *)data, (void *)mdata, rdata, opt_kernel_idx,
     989              :       true); /// @todo kernel supports both trans / noTrans situation
     990              :   } else {
     991              :     throw std::runtime_error(
     992              :       "Error: QINT4 Dot on CPU only supports PER_CHANNEL_AFFINE scheme");
     993              :   }
     994              : #else
     995              :   /// @note It is essential to understand that this section of the code requires
     996              :   /// the `input` data to be converted to Q4_0 type, not QINT4 type. This should
     997              :   /// be replaced with standard CPU INT4 computation instead of using Q4_0.
     998            0 :   gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
     999              : #endif
    1000              : #else
    1001              :   if (input.getMemoryData()->isSVM() && output.getMemoryData()->isSVM() &&
    1002              :       getMemoryData()->isSVM()) {
    1003              :     if (M == 1) {
    1004              :       gemv_int4_cl(mdata, input.getScale<uint16_t>(), data, rdata, K, N,
    1005              :                    Int4QTensor::getGroupSize());
    1006              :     } else {
    1007              :       openvino_sgemm_cl(data, mdata, input.getScale<uint16_t>(), rdata, M, N, K,
    1008              :                         Int4QTensor::getGroupSize());
    1009              :     }
    1010              :   } else {
    1011              :     /// @todo This should be replaced with standard CPU INT4 computation
    1012              :     gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
    1013              :   }
    1014              : #endif
    1015              : 
    1016            0 :   return output;
    1017              : }
    1018              : 
    1019        84504 : void FloatTensor::copy(const Tensor &from) {
    1020        84504 :   reshape(from.getDim());
    1021        84504 :   copy(from.getData<float>());
    1022        84504 : }
    1023              : 
    1024        13730 : void FloatTensor::copyData(const Tensor &from) {
    1025        13730 :   NNTR_THROW_IF(!contiguous, std::invalid_argument)
    1026              :     << getName() << " is not contiguous, cannot copy.";
    1027              : 
    1028        13730 :   NNTR_THROW_IF(size() != from.size(), std::invalid_argument)
    1029              :     << "Size of tensor to copy must match";
    1030              : 
    1031        13730 :   switch (from.getDataType()) {
    1032              :   case ml::train::TensorDim::DataType::FP32:
    1033        13721 :     copy(from.getData<float>());
    1034        13721 :     break;
    1035            0 :   case ml::train::TensorDim::DataType::FP16:
    1036              : /// @todo remove #ifdef ENABLE_FP16
    1037              : #ifdef ENABLE_FP16
    1038              :     scopy(size(), from.getData<_FP16>(), 1, (float *)getData(), 1);
    1039              : #else
    1040            0 :     throw std::invalid_argument("Error: enable-fp16 is not enabled");
    1041              : #endif
    1042              :     break;
    1043            1 :   case ml::train::TensorDim::DataType::QINT16:
    1044            2 :     copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
    1045            1 :     break;
    1046            5 :   case ml::train::TensorDim::DataType::QINT8:
    1047            5 :     scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
    1048            5 :                           (float *)getData(), 1);
    1049            5 :     break;
    1050            1 :   case ml::train::TensorDim::DataType::UINT16:
    1051            2 :     copy_u16_fp32(from.size(), from.getData<uint16_t>(), (float *)getData());
    1052            1 :     break;
    1053            2 :   case ml::train::TensorDim::DataType::UINT8:
    1054            2 :     scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
    1055            2 :                           (float *)getData(), 1);
    1056            2 :     break;
    1057            0 :   default:
    1058              :     throw std::invalid_argument(
    1059            0 :       "[FloatTensor::copyData] Error: Unsupported data type");
    1060              :     break;
    1061              :   }
    1062        13730 : }
    1063              : 
    1064         3509 : void FloatTensor::copy_with_stride(const Tensor &input, Tensor &output) {
    1065         7132 :   for (unsigned int b = 0; b < output.batch(); ++b) {
    1066         7246 :     for (unsigned int c = 0; c < output.channel(); ++c) {
    1067        16688 :       for (unsigned int h = 0; h < output.height(); ++h) {
    1068        91392 :         for (unsigned int w = 0; w < output.width(); ++w) {
    1069        78327 :           output.setValue(b, c, h, w, input.getValue<float>(b, c, h, w));
    1070              :         }
    1071              :       }
    1072              :     }
    1073              :   }
    1074         3509 : }
    1075              : 
    1076          648 : std::vector<unsigned int> FloatTensor::argmax() const {
    1077              :   std::vector<unsigned int> result;
    1078          648 :   const float *data = (float *)getData();
    1079              :   size_t batch_size = batch();
    1080          648 :   size_t feature_len = dim.getFeatureLen();
    1081              : 
    1082          648 :   result.resize(batch_size);
    1083              : 
    1084         8198 :   for (unsigned int b = 0; b < batch_size; b++) {
    1085              :     auto max_iter =
    1086         7550 :       std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
    1087         7550 :     result[b] = std::distance(data, max_iter) - (b * feature_len);
    1088              :   }
    1089          648 :   return result;
    1090            0 : }
    1091              : 
    1092            0 : std::vector<unsigned int> FloatTensor::argmin() const {
    1093              :   std::vector<unsigned int> result;
    1094            0 :   const float *data = (float *)getData();
    1095              :   size_t batch_size = batch();
    1096            0 :   size_t feature_len = dim.getFeatureLen();
    1097              : 
    1098            0 :   result.resize(batch_size);
    1099              : 
    1100            0 :   for (unsigned int b = 0; b < batch_size; b++) {
    1101              :     auto min_iter =
    1102            0 :       std::min_element(data + b * feature_len, data + (b + 1) * feature_len);
    1103            0 :     result[b] = std::distance(data, min_iter) - (b * feature_len);
    1104              :   }
    1105            0 :   return result;
    1106            0 : }
    1107              : 
    1108            6 : void FloatTensor::topK(unsigned int k, void *output_data,
    1109              :                        uint32_t *indices_data) {
    1110              :   const auto &input_dim = getDim();
    1111              :   const Tformat format = input_dim.getFormat();
    1112            6 :   const auto batch = input_dim.batch();
    1113            6 :   const auto channel = input_dim.channel();
    1114            6 :   const auto height = input_dim.height();
    1115            6 :   const auto width = input_dim.width();
    1116              : 
    1117            6 :   if (k == 0 || k > width) {
    1118              :     throw std::invalid_argument(
    1119            0 :       "k must be greater than 0 and less than or equal to width");
    1120              :   }
    1121              : 
    1122              :   float *output_buffer = static_cast<float *>(output_data);
    1123              : 
    1124              :   // Calculate strides for input and output
    1125            6 :   const auto input_strides = input_dim.computeStrides();
    1126            6 :   TensorDim output_dim = input_dim;
    1127            6 :   output_dim.width(k);
    1128            6 :   const auto output_strides = output_dim.computeStrides();
    1129              : 
    1130              : #ifdef _MSC_VER
    1131              : #pragma warning(push)
    1132              : #pragma warning(disable : 4849)
    1133              : #endif
    1134            6 : #pragma omp parallel for collapse(3)
    1135              : #ifdef _MSC_VER
    1136              : #pragma warning(pop)
    1137              : #endif
    1138              :   for (int b = 0; b < static_cast<int>(batch); ++b) {
    1139              :     for (int c = 0; c < static_cast<int>(channel); ++c) {
    1140              :       for (int h = 0; h < static_cast<int>(height); ++h) {
    1141              : 
    1142              :         size_t offset;
    1143              :         if (format == Tformat::NCHW) {
    1144              :           // NCHW: [b][c][h][i]
    1145              :           offset =
    1146              :             b * input_strides[0] + c * input_strides[1] + h * input_strides[2];
    1147              :         } else {
    1148              :           // NHWC: [b][h][i][c]
    1149              :           offset = b * input_strides[0] + h * input_strides[1] + c;
    1150              :         }
    1151              : 
    1152              :         const unsigned int width_stride =
    1153              :           format == Tformat::NHWC ? input_strides[2] : 1;
    1154              :         const float *B = static_cast<const float *>(getData()) + offset;
    1155              :         std::vector<size_t> idx(width);
    1156              :         std::iota(idx.begin(), idx.end(), 0);
    1157              :         std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
    1158              :                           [&B, width_stride](size_t i1, size_t i2) {
    1159          103 :                             return B[i1 * width_stride] > B[i2 * width_stride];
    1160              :                           });
    1161              : 
    1162              :         // write top-k values and their indices to output
    1163              :         for (unsigned int i = 0; i < k; ++i) {
    1164              :           size_t output_idx;
    1165              :           if (format == Tformat::NCHW) {
    1166              :             // NCHW: [b][c][h][i]
    1167              :             output_idx = b * output_strides[0] + c * output_strides[1] +
    1168              :                          h * output_strides[2] + i;
    1169              :           } else {
    1170              :             // NHWC: [b][h][i][c]
    1171              :             output_idx = b * output_strides[0] + h * output_strides[1] +
    1172              :                          i * output_strides[2] + c;
    1173              :           }
    1174              :           output_buffer[output_idx] = B[idx[i]];
    1175              :           indices_data[output_idx] = static_cast<uint32_t>(idx[i]);
    1176              :         }
    1177              :       }
    1178              :     }
    1179              :   }
    1180            6 : }
    1181              : 
    1182            3 : float FloatTensor::max_abs() const {
    1183            3 :   const float *data = (float *)getData();
    1184            3 :   unsigned int idx = isamax(size(), data, 1);
    1185            3 :   return *(data + idx);
    1186              : }
    1187              : 
    1188          169 : float FloatTensor::maxValue() const {
    1189          169 :   const float *data = (float *)getData();
    1190          169 :   return *std::max_element(data, data + size());
    1191              : }
    1192              : 
    1193          169 : float FloatTensor::minValue() const {
    1194          169 :   const float *data = (float *)getData();
    1195          169 :   return *std::min_element(data, data + size());
    1196              : }
    1197              : 
    1198         1114 : Tensor &FloatTensor::transpose(const std::string &direction,
    1199              :                                Tensor &output) const {
    1200              :   unsigned int SL, SI, SJ, SK;
    1201              : 
    1202         1114 :   output.reshape(dim.transpose(direction));
    1203              : 
    1204         1113 :   int indexI = direction[0] - '0';
    1205         1113 :   int indexJ = direction[2] - '0';
    1206              : 
    1207         1113 :   SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
    1208              : 
    1209              :   bool is_format_nchw = (getFormat() == Tformat::NCHW);
    1210              : 
    1211         1113 :   const float *inptr = (float *)getData();
    1212              :   float *outptr = output.getData<float>();
    1213         1113 :   switch (indexI) {
    1214           17 :   case 0:
    1215           17 :     if (indexJ == 1) {
    1216            2 :       if (is_format_nchw) {
    1217          308 :         transposeloop(l, i, j, k, SL, SI, SJ, SK);
    1218              :       } else {
    1219            0 :         transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
    1220              :       }
    1221              :     } else {
    1222           15 :       if (is_format_nchw) {
    1223           34 :         for (unsigned int b = 0; b < batch(); ++b) {
    1224           44 :           for (unsigned int c = 0; c < channel(); ++c) {
    1225           50 :             transpose_matrix(
    1226           25 :               height(), width(), (float *)getData() + getIndex(b, c, 0, 0),
    1227           25 :               width(), (float *)output.getData() + output.getIndex(b, c, 0, 0),
    1228           25 :               output.width());
    1229              :           }
    1230              :         }
    1231              :       } else {
    1232            0 :         transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
    1233              :       }
    1234              :     }
    1235              :     break;
    1236         1092 :   case 1:
    1237         1092 :     if (indexJ == 0) {
    1238         1086 :       if (is_format_nchw) {
    1239       118863 :         transposeloop(l, j, i, k, SL, SJ, SI, SK);
    1240              :       } else {
    1241            0 :         transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
    1242              :       }
    1243              :     } else {
    1244            6 :       if (is_format_nchw) {
    1245         2928 :         transposeloop(l, j, k, i, SL, SJ, SK, SI);
    1246              :       } else {
    1247            0 :         transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
    1248              :       }
    1249              :     }
    1250              :     break;
    1251            4 :   case 2:
    1252            4 :     if (indexJ == 0) {
    1253            2 :       if (is_format_nchw) {
    1254          338 :         transposeloop(l, k, i, j, SL, SK, SI, SJ);
    1255              :       } else {
    1256            0 :         transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
    1257              :       }
    1258              :     } else {
    1259            2 :       if (is_format_nchw) {
    1260          398 :         transposeloop(l, k, j, i, SL, SK, SJ, SI);
    1261              :       } else {
    1262            0 :         transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
    1263              :       }
    1264              :     }
    1265              :     break;
    1266              :   }
    1267              : 
    1268         1113 :   return output;
    1269              : }
    1270              : 
    1271           10 : void FloatTensor::dropout_mask(float dropout) {
    1272           10 :   float scale = 1.0 / (1 - dropout);
    1273           10 :   float *data_ = (float *)getData();
    1274          370 :   for (unsigned int i = 0; i < size(); ++i) {
    1275          360 :     if (data_[i] >= dropout)
    1276          148 :       data_[i] = scale;
    1277              :     else
    1278          212 :       data_[i] = 0.0;
    1279              :   }
    1280           10 : }
    1281              : 
    1282            0 : void FloatTensor::filter_mask(const Tensor &mask_len, bool reverse) {
    1283              :   float fill_mask_val = 0.0;
    1284              :   float en_mask_val = 1.0 - fill_mask_val;
    1285              : 
    1286            0 :   if (reverse) {
    1287              :     fill_mask_val = 1.0;
    1288              :     en_mask_val = 1.0 - fill_mask_val;
    1289              :   }
    1290              : 
    1291            0 :   setValue(fill_mask_val);
    1292              : 
    1293            0 :   NNTR_THROW_IF(mask_len.batch() != batch(), std::invalid_argument)
    1294              :     << "Number of filter masks mismatched";
    1295              : 
    1296            0 :   for (unsigned int b = 0; b < batch(); b++) {
    1297            0 :     float *addr = (float *)getAddress(getIndex(b, 0, 0, 0));
    1298              :     const unsigned int *mask_len_val =
    1299            0 :       mask_len.getAddress<unsigned int>(b, 0, 0, 0);
    1300            0 :     std::fill(addr, addr + (*mask_len_val), en_mask_val);
    1301              :   }
    1302            0 : }
    1303              : 
    1304            3 : void FloatTensor::zoneout_mask(Tensor &opposite, float zoneout) {
    1305            3 :   opposite.setRandBernoulli(zoneout);
    1306              : 
    1307            3 :   float *data = (float *)getData();
    1308              :   float *opposite_data = opposite.getData<float>();
    1309              : 
    1310      2010003 :   for (unsigned int i = 0; i < size(); ++i) {
    1311      2010000 :     if (opposite_data[i] > epsilon) {
    1312       603513 :       data[i] = 0.0f;
    1313              :     } else {
    1314      1406487 :       data[i] = 1.0f;
    1315              :     }
    1316              :   }
    1317            3 : }
    1318              : 
    1319           13 : std::vector<Tensor> FloatTensor::split(std::vector<size_t> sizes, int axis) {
    1320              :   size_t num_size = sizes.size();
    1321              : 
    1322           13 :   if (axis == -1) {
    1323              :     axis = 3;
    1324              :   }
    1325              : 
    1326              :   size_t total_size =
    1327              :     std::accumulate(sizes.begin(), sizes.end(), static_cast<size_t>(0));
    1328           14 :   NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument)
    1329              :     << "given sum of sizes did not match with origin tensor dim, tensor dim: "
    1330            1 :     << dim.getTensorDim(axis) << " total size: " << total_size;
    1331              : 
    1332           12 :   std::vector<TensorDim> ret_dims(num_size, dim);
    1333           43 :   for (unsigned int i = 0; i < num_size; ++i) {
    1334           31 :     ret_dims[i].setTensorDim(axis, sizes[i]);
    1335              :   }
    1336              : 
    1337           12 :   bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false;
    1338              :   std::vector<Tensor> ret;
    1339              : 
    1340         1248 :   auto iter_value = [this, is_format_nchw](
    1341              :                       std::array<size_t, 4> &loc,
    1342              :                       const std::array<size_t, 4> &end_loc,
    1343              :                       const std::array<size_t, 4> &reset_dim_arr) -> float & {
    1344         1248 :     auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
    1345            0 :                                    : getValue(loc[0], loc[3], loc[1], loc[2]);
    1346         1960 :     for (int i = 3; i >= 0; --i) {
    1347         1929 :       loc[i]++;
    1348         1929 :       if (loc[i] == end_loc[i]) {
    1349          712 :         loc[i] -= reset_dim_arr[i];
    1350              :         continue;
    1351              :       }
    1352              :       break;
    1353              :     }
    1354         1248 :     return value;
    1355           12 :   };
    1356              : 
    1357              :   unsigned int accumulated_size = 0;
    1358           43 :   for (unsigned int i = 0; i < num_size; ++i) {
    1359           31 :     std::array<size_t, 4> loc = {0, 0, 0, 0};
    1360              : 
    1361           31 :     if (is_format_nchw) {
    1362           31 :       loc[axis] += accumulated_size;
    1363              :     } else {
    1364            0 :       if (axis == 0) {
    1365            0 :         loc[0] += accumulated_size;
    1366            0 :       } else if (axis == 1) {
    1367            0 :         loc[3] += accumulated_size;
    1368            0 :       } else if (axis == 2 || axis == 3) {
    1369            0 :         loc[axis - 1] += accumulated_size;
    1370              :       }
    1371              :     }
    1372              : 
    1373           62 :     ret.push_back(Tensor(ret_dims[i]));
    1374              :     auto &ret_t = ret.back();
    1375              : 
    1376              :     std::array<size_t, 4> end_loc;
    1377              : 
    1378           31 :     if (is_format_nchw) {
    1379           62 :       end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
    1380           31 :                  ret_dims[i].height(), ret_dims[i].width()};
    1381              :     } else {
    1382            0 :       end_loc = {ret_dims[i].batch(), ret_dims[i].height(), ret_dims[i].width(),
    1383            0 :                  ret_dims[i].channel()};
    1384              :     }
    1385              : 
    1386           31 :     accumulated_size += sizes[i];
    1387              : 
    1388           31 :     if (is_format_nchw) {
    1389           31 :       end_loc[axis] = accumulated_size;
    1390              :     } else {
    1391            0 :       if (axis == 0) {
    1392            0 :         end_loc[0] = accumulated_size;
    1393            0 :       } else if (axis == 1) {
    1394            0 :         end_loc[3] = accumulated_size;
    1395            0 :       } else if (axis == 2 || axis == 3) {
    1396            0 :         end_loc[axis - 1] = accumulated_size;
    1397              :       }
    1398              :     }
    1399              : 
    1400              :     std::array<size_t, 4> reset_dim_arr;
    1401              :     if (is_format_nchw) {
    1402           62 :       reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
    1403           31 :                        ret_dims[i].height(), ret_dims[i].width()};
    1404              :     } else {
    1405            0 :       reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
    1406            0 :                        ret_dims[i].width(), ret_dims[i].channel()};
    1407              :     }
    1408              : 
    1409           31 :     ret_t.apply_i<float>(
    1410           62 :       [&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
    1411         1248 :         return iter_value(loc, end_loc, reset_dim_arr);
    1412              :       });
    1413              :   }
    1414              : 
    1415           12 :   return ret;
    1416           12 : }
    1417              : 
    1418            5 : Tensor FloatTensor::concat(const std::vector<Tensor> &tensors, int axis,
    1419              :                            Tensor &output) {
    1420            5 :   bool is_format_nchw = (tensors.front().getDim().getFormat() == Tformat::NCHW);
    1421              : 
    1422              :   auto iter_value =
    1423          746 :     [is_format_nchw](std::array<unsigned, 4> &loc,
    1424              :                      const std::array<unsigned, 4> &start_loc, Tensor &t,
    1425              :                      const std::array<unsigned, 4> &ref_dim_arr) -> float & {
    1426          746 :     auto &value = is_format_nchw
    1427          746 :                     ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
    1428            0 :                     : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
    1429              : 
    1430         1044 :     for (int i = 3; i >= 0; --i) {
    1431         1033 :       loc[i]++;
    1432         1033 :       if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
    1433          298 :         loc[i] = start_loc[i];
    1434              :         continue;
    1435              :       }
    1436              :       break;
    1437              :     }
    1438          746 :     return value;
    1439            5 :   };
    1440              : 
    1441            5 :   std::array<unsigned, 4> loc = {0, 0, 0, 0};
    1442           16 :   for (auto &t : tensors) {
    1443           11 :     std::array<unsigned, 4> start_loc = loc;
    1444              :     std::array<unsigned, 4> tensor_dim_arr;
    1445           11 :     TensorDim curr_dim = t.getDim();
    1446              : 
    1447           11 :     tensor_dim_arr[0] = curr_dim.getTensorDim(0);
    1448           22 :     tensor_dim_arr[1] =
    1449           11 :       is_format_nchw ? curr_dim.getTensorDim(1) : curr_dim.getTensorDim(2);
    1450           22 :     tensor_dim_arr[2] =
    1451           11 :       is_format_nchw ? curr_dim.getTensorDim(2) : curr_dim.getTensorDim(3);
    1452           22 :     tensor_dim_arr[3] =
    1453           11 :       is_format_nchw ? curr_dim.getTensorDim(3) : curr_dim.getTensorDim(1);
    1454              : 
    1455          757 :     for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
    1456          746 :       iter_value(loc, start_loc, output, tensor_dim_arr) = t.getValue<float>(i);
    1457              :     }
    1458              : 
    1459           11 :     if (is_format_nchw) {
    1460           11 :       loc[axis] += curr_dim.getTensorDim(axis);
    1461              :     } else {
    1462            0 :       if (axis == 0) {
    1463            0 :         loc[0] += curr_dim.getTensorDim(axis);
    1464            0 :       } else if (axis == 1) {
    1465            0 :         loc[3] += curr_dim.getTensorDim(axis);
    1466            0 :       } else if (axis == 2 || axis == 3) {
    1467            0 :         loc[axis - 1] += curr_dim.getTensorDim(axis);
    1468              :       }
    1469              :     }
    1470              :   }
    1471              : 
    1472            5 :   return output;
    1473              : }
    1474              : 
    1475           10 : void FloatTensor::print(std::ostream &out) const {
    1476           10 :   const float *data = (float *)getData();
    1477           10 :   unsigned int len = size();
    1478           10 :   out << "data addr: " << data << '\n';
    1479           10 :   out << dim;
    1480              : 
    1481           10 :   if (len > 100) {
    1482            6 :     out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
    1483            6 :         << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1] << ']'
    1484              :         << std::endl;
    1485            6 :     return;
    1486              :   }
    1487              : 
    1488            4 :   std::ios init(NULL);
    1489            4 :   init.copyfmt(out);
    1490              : 
    1491            4 :   if (getFormat() == Tformat::NCHW) {
    1492           10 :     for (unsigned int k = 0; k < batch(); k++) {
    1493           12 :       for (unsigned int l = 0; l < channel(); l++) {
    1494           17 :         for (unsigned int i = 0; i < height(); i++) {
    1495           47 :           for (unsigned int j = 0; j < width(); j++) {
    1496              :             out << std::setw(10) << std::setprecision(10)
    1497           36 :                 << data[getIndex(k, l, i, j)] << " ";
    1498              :           }
    1499              :           out << std::endl;
    1500              :         }
    1501              :         out << std::endl;
    1502              :       }
    1503              :       out << "-------" << std::endl;
    1504              :     }
    1505              :   } else {
    1506            0 :     for (unsigned int k = 0; k < batch(); k++) {
    1507            0 :       for (unsigned int i = 0; i < height(); i++) {
    1508            0 :         for (unsigned int j = 0; j < width(); j++) {
    1509            0 :           for (unsigned int l = 0; l < channel(); l++) {
    1510              :             out << std::setw(10) << std::setprecision(10)
    1511            0 :                 << data[getIndex(k, l, i, j)] << " ";
    1512              :           }
    1513              :           out << std::endl;
    1514              :         }
    1515              :         out << std::endl;
    1516              :       }
    1517              :       out << "-------" << std::endl;
    1518              :     }
    1519              :   }
    1520            4 :   out.copyfmt(init);
    1521              : }
    1522              : 
    1523       112088 : void FloatTensor::copy(const void *buf) {
    1524       112088 :   NNTR_THROW_IF(!contiguous, std::invalid_argument)
    1525              :     << getName() << " is not contiguous, cannot copy.";
    1526              : 
    1527       112088 :   if (buf == getData()) {
    1528              :     return;
    1529              :   }
    1530              : 
    1531       111785 :   scopy(size(), (float *)buf, 1, (float *)getData(), 1);
    1532              : }
    1533              : 
    1534       100882 : void FloatTensor::apply_broadcast_util(
    1535              :   Tensor const &m,
    1536              :   std::function<void(const BroadcastInfo &e, const float *, const float *,
    1537              :                      float *)>
    1538              :     v_func,
    1539              :   Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
    1540              :   size_t m_offset) const {
    1541              : 
    1542       100882 :   const float *buf = (float *)this->getData();
    1543              :   const float *m_buf = m.getData<float>();
    1544              :   float *out_buf = output.getData<float>();
    1545              : 
    1546       100882 :   if (e.buffer_axis == cur_axis) {
    1547        77211 :     v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
    1548        77211 :     return;
    1549              :   }
    1550              : 
    1551        23671 :   cur_axis++;
    1552        23671 :   unsigned int continuity[4] = {0, 1, 2, 3};
    1553        23671 :   if (getFormat() == Tformat::NHWC) {
    1554           10 :     continuity[1] = 2;
    1555           10 :     continuity[2] = 3;
    1556           10 :     continuity[3] = 1;
    1557              :   }
    1558       113890 :   for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) {
    1559        90219 :     size_t next_offset = offset + i * strides[cur_axis];
    1560        90219 :     size_t next_m_offset = m_offset + i * e.strides[cur_axis];
    1561       180438 :     apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
    1562              :                          next_m_offset);
    1563              :   }
    1564              : }
    1565              : 
    1566        76795 : void FloatTensor::apply_broadcast(
    1567              :   Tensor const &m,
    1568              :   std::function<void(const BroadcastInfo &e, const float *, const float *,
    1569              :                      float *)>
    1570              :     v_func,
    1571              :   Tensor &output) const {
    1572        91891 :   CREATE_IF_EMPTY_DIMS(output, dim);
    1573              : 
    1574        76799 :   NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
    1575              :     << getName() << " is not allocated";
    1576        76795 :   NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
    1577            4 :     << m.getName() << " is not allocated";
    1578        76791 :   NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
    1579            4 :     << output.getName() << " is not allocated";
    1580              : 
    1581              :   /// shortcut to cover when dimension matches
    1582              :   /// note that buffer_size, the last stride is only used in v_func but it
    1583              :   /// might be changed
    1584        76783 :   if (dim == m.getDim()) {
    1585              :     BroadcastInfo e;
    1586        65954 :     e.buffer_size = size();
    1587        65954 :     e.strides[3] = 1;
    1588        65954 :     e.tensor_type = getTensorType();
    1589       131908 :     v_func(e, (float *)getData(), m.getData<float>(), output.getData<float>());
    1590              :     return;
    1591              :   }
    1592              : 
    1593        21492 :   return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
    1594              : }
    1595              : 
    1596           12 : bool FloatTensor::isValid() const {
    1597           12 :   return is_valid(dim.getDataLen(), (float *)getData());
    1598              : }
    1599              : 
    1600              : } // namespace nntrainer

Generated by: LCOV version 2.0-1