LCOV - code coverage report
Current view: top level - nntrainer/tensor - float_tensor.cpp (source / functions) Coverage Total Hit
Test: coverage_filtered.info Lines: 76.9 % 750 577
Test Date: 2026-01-12 20:43:37 Functions: 90.7 % 75 68

            Line data    Source code
       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * @file        float_tensor.cpp
       4              :  * @date        01 December 2023
       5              :  * @brief       This is FloatTensor class for 32-bit floating point calculation
       6              :  * @see         https://github.com/nntrainer/nntrainer
       7              :  * @author      Jijoong Moon <jijoong.moon@samsung.com>
       8              :  * @author      Donghyeon Jeong <dhyeon.jeong@samsung.com>
       9              :  * @bug         No known bugs except for NYI items
      10              :  */
      11              : 
      12              : #include <iomanip>
      13              : #include <iostream>
      14              : #include <numeric>
      15              : 
      16              : #include <chrono>
      17              : #include <cpu_backend.h>
      18              : #include <float_tensor.h>
      19              : #include <int4_tensor.h>
      20              : #include <q4_0_utils.h>
      21              : 
      22              : #include <tensor.h>
      23              : #include <util_func.h>
      24              : 
      25              : #ifdef ENABLE_OPENCL
      26              : #include "blas_kernels.h"
      27              : #endif
      28              : 
      29              : namespace nntrainer {
      30              : 
/**
 * @brief Construct an empty (dimension-less) FloatTensor with the given name
 *        and format; the data type is fixed to FP32.
 */
FloatTensor::FloatTensor(std::string name_, Tformat fm) :
  TensorBase(name_, fm, Tdatatype::FP32) {}
      33              : 
      34       375564 : FloatTensor::FloatTensor(const TensorDim &d, bool alloc_now, Initializer init,
      35       375564 :                          std::string name) :
      36       375564 :   TensorBase(d, alloc_now, init, name) {
      37       375564 :   if (alloc_now)
      38       341393 :     allocate();
      39       375564 : }
      40              : 
      41       335982 : FloatTensor::FloatTensor(const TensorDim &d, const void *buf) :
      42       335982 :   FloatTensor(d, true) {
      43       335982 :   if (d.getDataLen() != 0) {
      44       335982 :     if (buf != nullptr)
      45        13863 :       copy(buf);
      46              :   }
      47       335982 : }
      48              : 
      49        10794 : bool FloatTensor::operator==(const FloatTensor &rhs) const {
      50        10794 :   const float *_data = (float *)getData();
      51        10794 :   const float *_rdata = (float *)rhs.getData();
      52      1403019 :   for (size_t i = 0; i < size(); ++i) {
      53              :     /** not checking sign change is intentional to avoid float calculation
      54              :      * errors around 0 */
      55      1392265 :     if (std::isnan(_data[i]) || std::isnan(_rdata[i]) ||
      56      1392264 :         std::fabs(_data[i] - _rdata[i]) > epsilon)
      57              :       return false;
      58              :   }
      59              : 
      60              :   return true;
      61              : }
      62              : 
      63              : /// @todo support allocation by src_tensor
/**
 * @brief Allocate the backing float buffer for this tensor.
 *
 * No-op when the tensor is empty or already holds data. When a source
 * tensor is set, memory is shared with it (and deliberately left
 * uninitialized); otherwise a fresh zero-initialized buffer is created and
 * the configured initializer is applied.
 */
void FloatTensor::allocate() {
  if (empty() || data)
    return;

  if (src_tensor) {
    /// allocate data based on the source tensor
    allocateSrcTensor();
    /** as this memory is shared, do NOT initialize */
  } else {
    /// allocate new memory for the tensor data
    MemoryData *mem_data;

    // `new float[...]{}` value-initializes, so the buffer starts zeroed.
    mem_data = new MemoryData((void *)(new float[dim.getDataLen()]{}));
    // Custom deleter frees both the float array and the MemoryData wrapper
    // when the last shared owner releases it.
    data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
      delete[] mem_data->template getAddr<float>();
      delete mem_data;
    });

    offset = 0;
    initialize();
  }
}
      86              : 
      87          906 : void FloatTensor::deallocate() {
      88              :   data = nullptr;
      89          906 :   offset = 0;
      90          906 : }
      91              : 
      92    136172133 : void *FloatTensor::getData() const {
      93    136172133 :   if (!data)
      94              :     return nullptr;
      95              : 
      96              :   data->validate();
      97    136170514 :   return data->getAddr<float>() + offset;
      98              : }
      99              : 
     100            0 : void *FloatTensor::getData(size_t idx) const {
     101            0 :   if (!data)
     102              :     return nullptr;
     103              : 
     104              :   data->validate();
     105            0 :   return data->getAddr<float>() + offset + idx;
     106              : }
     107              : 
     108      5427307 : void *FloatTensor::getAddress(unsigned int i) {
     109      5427307 :   size_t index = getIndex(batch(), channel(), height(), width());
     110      5427307 :   if (i > index) {
     111              :     return nullptr;
     112              :   }
     113      5427307 :   return &((float *)getData())[i];
     114              : }
     115              : 
     116       148414 : const void *FloatTensor::getAddress(unsigned int i) const {
     117       148414 :   size_t index = getIndex(batch(), channel(), height(), width());
     118       148414 :   if (i > index) {
     119              :     return nullptr;
     120              :   }
     121       148414 :   return &((float *)getData())[i];
     122              : }
     123              : 
     124         4584 : const float &FloatTensor::getValue(unsigned int i) const {
     125         4584 :   return ((float *)getData())[i];
     126              : }
     127              : 
     128         1291 : float &FloatTensor::getValue(unsigned int i) { return ((float *)getData())[i]; }
     129              : 
     130         4584 : const float &FloatTensor::getValue(unsigned int b, unsigned int c,
     131              :                                    unsigned int h, unsigned int w) const {
     132         4584 :   return getValue(getIndex(b, c, h, w));
     133              : }
     134              : 
     135         1260 : float &FloatTensor::getValue(unsigned int b, unsigned int c, unsigned int h,
     136              :                              unsigned int w) {
     137         1260 :   return getValue(getIndex(b, c, h, w));
     138              : }
     139              : 
     140        81317 : void FloatTensor::setValue(float value) {
     141        81317 :   float *data = (float *)getData();
     142        81317 :   std::fill(data, data + size(), value);
     143        81317 : }
     144              : 
     145     35425636 : void FloatTensor::setValue(unsigned int b, unsigned int c, unsigned int h,
     146              :                            unsigned int w, float value) {
     147     35425636 :   ((float *)getData())[getIndex(b, c, h, w)] = value;
     148     35425636 : }
     149              : 
     150        10913 : void FloatTensor::addValue(unsigned int b, unsigned int c, unsigned int h,
     151              :                            unsigned int w, float value, float beta) {
     152        10913 :   auto const &idx = getIndex(b, c, h, w);
     153        10913 :   ((float *)getData())[idx] *= beta;
     154        10913 :   ((float *)getData())[idx] += value;
     155        10913 : }
     156              : 
     157        42378 : void FloatTensor::setZero() {
     158        42378 :   if (contiguous) {
     159              :     //  sscal(size(), 0, getData<float>(), 1);
     160              :     /// @note we cannot use sscal, when we set zero. if the data is inf or
     161              :     /// NaN, then the inf or NaN still remain.
     162        42378 :     memset((float *)getData(), 0, sizeof(float) * size());
     163              :   } else {
     164              :     /// @todo implement apply_i
     165              :     // apply_i<float>([](float val) -> float { return 0; });
     166            0 :     setValue(0);
     167              :   }
     168        42378 : }
     169              : 
     170          110 : void FloatTensor::setRandNormal(float mean, float stddev) {
     171          110 :   setDist<std::normal_distribution<float>>(
     172              :     std::normal_distribution<float>(mean, stddev));
     173          110 : }
     174              : 
     175        18101 : void FloatTensor::setRandUniform(float min, float max) {
     176        18101 :   setDist<std::uniform_real_distribution<float>>(
     177              :     std::uniform_real_distribution<float>(min, max));
     178        18101 : }
     179              : 
     180            3 : void FloatTensor::setRandBernoulli(float probability) {
     181            3 :   setDist<std::bernoulli_distribution>(
     182              :     std::bernoulli_distribution(probability));
     183            3 : }
     184              : 
     185       360589 : void FloatTensor::initialize() {
     186       360589 :   if (empty() || !isAllocated())
     187              :     return;
     188              : 
     189              :   unsigned int fan_in, fan_out;
     190              : 
     191              :   /// @fixme: when unit is equal to one, this does not work, we need to rely on
     192              :   /// effective dimension then actual numbers here. For now, some heuristics
     193              :   /// added to infer what would be fan_in/fan_out
     194       360588 :   if (dim.batch() * dim.channel() * dim.height() == 1) {
     195       233471 :     fan_out = fan_in = dim.width();
     196       127117 :   } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
     197        12222 :     fan_in = dim.height();
     198        12222 :     fan_out = dim.width();
     199              :   } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
     200       114895 :     auto field_size = dim.height() * dim.width();
     201              : 
     202              :     // this also handles below cases.
     203              :     // 1. fan_in = fan_out = 1 as well.
     204              :     // 2. batch == 1, channel == 1 and height == 1, theoretical rank of 1
     205       114895 :     fan_in = dim.channel() * field_size;
     206       114895 :     fan_out = dim.batch() * field_size;
     207              :   }
     208              : 
     209       360588 :   switch (initializer) {
     210         5666 :   case Initializer::ZEROS:
     211         5666 :     setZero();
     212         5666 :     break;
     213          242 :   case Initializer::ONES:
     214          242 :     setValue(1.0f);
     215          242 :     break;
     216            0 :   case Initializer::LECUN_NORMAL:
     217            0 :     setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
     218            0 :     break;
     219            0 :   case Initializer::XAVIER_NORMAL:
     220            0 :     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
     221            0 :     break;
     222            2 :   case Initializer::HE_NORMAL:
     223            2 :     setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
     224            2 :     break;
     225            0 :   case Initializer::LECUN_UNIFORM:
     226            0 :     setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
     227            0 :     break;
     228         2294 :   case Initializer::XAVIER_UNIFORM:
     229         2294 :     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
     230         2294 :                    sqrtFloat(6.0 / (fan_in + fan_out)));
     231         2294 :     break;
     232            0 :   case Initializer::HE_UNIFORM:
     233            0 :     setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
     234            0 :                    sqrtFloat(6.0 / (fan_in)));
     235              :   default:
     236              :     break;
     237              :   }
     238              : 
     239       360588 :   putData();
     240              : }
     241              : 
     242            7 : void FloatTensor::initialize(Initializer init) {
     243            7 :   initializer = init;
     244            7 :   initialize();
     245            7 : }
     246              : 
/**
 * @brief Apply `f` to every element, writing results into `output`.
 *
 * @param f      element-wise transformation
 * @param output destination tensor (created with this tensor's dims if empty)
 * @return reference to `output`
 *
 * Three paths by decreasing speed: fully contiguous (single std::transform),
 * row-contiguous (innermost width-stride is 1 on both sides; transform per
 * row), and the fully-strided element-by-element fallback.
 */
Tensor &FloatTensor::apply(std::function<float(float)> f,
                           Tensor &output) const {
  CREATE_IF_EMPTY_DIMS(output, dim, nullptr);

  if (contiguous && output.getContiguous()) {
    // Fast path: both buffers are dense, one pass over the flat data.
    const float *data = (float *)getData();
    float *rdata = output.getData<float>();

    std::transform(data, data + size(), rdata, f);
  } else if (strides[3] == 1 && output.getStrides()[3] == 1) {
    /** @todo optimize this with combining these loops where stride is 1 */
    for (unsigned int b = 0; b < batch(); ++b) {
      for (unsigned int c = 0; c < channel(); ++c) {
        for (unsigned int h = 0; h < height(); ++h) {
          // Each (b, c, h) row is contiguous; transform a row at a time.
          float *out_data = output.getAddress<float>(b, c, h, 0);
          const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
          std::transform(in_data, in_data + width(), out_data, f);
        }
      }
    }
  } else {
    // Fully strided fallback: per-element indexed access.
    for (unsigned int b = 0; b < batch(); ++b) {
      for (unsigned int c = 0; c < channel(); ++c) {
        for (unsigned int h = 0; h < height(); ++h) {
          for (unsigned int w = 0; w < width(); ++w) {
            output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
          }
        }
      }
    }
  }

  return output;
}
     281              : 
/**
 * @brief Element-wise multiplication supporting strided tensors (no
 *        broadcasting): output = this * m (+ beta * output when beta != 0).
 *
 * @param m      right-hand operand; must match this tensor's size
 * @param output destination; must match this tensor's size
 * @param beta   scale for the existing output contents
 * @return `output`
 * @throws std::invalid_argument on size mismatch or unallocated operands
 */
Tensor FloatTensor::multiply_strided(Tensor const &m, Tensor &output,
                                     const float beta) const {
  CREATE_IF_EMPTY_DIMS(output, dim, nullptr);

  if (size() != m.size() || size() != output.size())
    throw std::invalid_argument(
      "Strided multiplication does not support broadcasting");

  NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
    << getName() << " is not allocated";
  NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
    << m.getName() << " is not allocated";
  NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
    << output.getName() << " is not allocated";

  // Slow element-wise path: any operand's innermost stride != 1, or a
  // non-zero beta requires blending with output's existing contents.
  if (strides[3] != 1 || m.getStrides()[3] != 1 ||
      output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
    for (unsigned int b = 0; b < batch(); ++b) {
      for (unsigned int c = 0; c < channel(); ++c) {
        for (unsigned int h = 0; h < height(); ++h) {
          for (unsigned int w = 0; w < width(); ++w) {
            // output[b,c,h,w] = beta * output[b,c,h,w] + this * m
            output.addValue(
              b, c, h, w, getValue(b, c, h, w) * m.getValue<float>(b, c, h, w),
              beta);
          }
        }
      }
    }
  } else {
    /** @todo optimize by combining these loops where stride is 1 */
    if (getFormat() == Tformat::NCHW) {
      // Row-wise transform: each (b, c, h) row is contiguous in width.
      for (unsigned int b = 0; b < batch(); ++b) {
        for (unsigned int c = 0; c < channel(); ++c) {
          for (unsigned int h = 0; h < height(); ++h) {
            float *out_data = output.getAddress<float>(b, c, h, 0);
            const float *m_data = m.getAddress<float>(b, c, h, 0);
            const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
            std::transform(in_data, in_data + width(), m_data, out_data,
                           std::multiplies<float>());
          }
        }
      }
    } else {
      // NHWC: the channel dimension is innermost instead.
      for (unsigned int b = 0; b < batch(); ++b) {
        for (unsigned int h = 0; h < height(); ++h) {
          for (unsigned int w = 0; w < width(); ++w) {
            float *out_data = output.getAddress<float>(b, 0, h, w);
            const float *m_data = m.getAddress<float>(b, 0, h, w);
            const float *in_data = (float *)getAddress(getIndex(b, 0, h, w));
            std::transform(in_data, in_data + channel(), m_data, out_data,
                           std::multiplies<float>());
          }
        }
      }
    }
  }

  return output;
}
     341              : 
     342         2609 : int FloatTensor::multiply_i(float const &value) {
     343         2609 :   float *data = (float *)getData();
     344         2609 :   unsigned int len = size();
     345              : 
     346         2609 :   sscal(len, value, data, 1);
     347              : 
     348         2609 :   return ML_ERROR_NONE;
     349              : }
     350              : 
     351         7691 : Tensor &FloatTensor::multiply(float const &value, Tensor &out) const {
     352              :   auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
     353         7691 :   apply(f, out);
     354         7691 :   return out;
     355              : }
     356              : 
     357        17213 : Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output,
     358              :                               const float beta) const {
     359              :   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
     360              :                float *out_buf) {
     361        29863 :     ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
     362        29863 :             strides[3]);
     363              :   };
     364              : 
     365        17213 :   NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
     366              :     << "Tensor Format of " << getName() << ":"
     367              :     << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
     368            0 :     << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
     369              : 
     370        17213 :   NNTR_THROW_IF(!contiguous || !m.getContiguous() || !output.getContiguous(),
     371              :                 std::invalid_argument)
     372              :     << getName() << " is not contiguous, cannot multiply";
     373              : 
     374        17213 :   NNTR_THROW_IF(!contiguous || !m.getContiguous() || !output.getContiguous(),
     375              :                 std::invalid_argument)
     376              :     << getName() << " is not contiguous, cannot multiply";
     377              : 
     378        17213 :   apply_broadcast(m, f, output);
     379        17074 :   return output;
     380              : }
     381              : 
     382         6273 : Tensor &FloatTensor::divide(float const &value, Tensor &output) const {
     383              :   auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
     384         6273 :   apply(f, output);
     385         6273 :   return output;
     386              : }
     387              : 
     388          198 : Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const {
     389              :   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
     390              :                float *out_buf) {
     391          333 :     ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
     392              :   };
     393              : 
     394          198 :   apply_broadcast(m, f, output);
     395          190 :   return output;
     396              : }
     397              : 
/**
 * @brief Element-wise addition supporting strided tensors:
 *        output = this + input (or this + input * beta when beta != 0).
 *
 * @param input  right-hand operand
 * @param output destination tensor
 * @param beta   scale applied to `input` on the slow path
 * @return `output`
 * @throws std::invalid_argument when any operand is unallocated
 */
Tensor &FloatTensor::add_strided(Tensor const &input, Tensor &output,
                                 const float beta) const {
  NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
    << getName() << " is not allocated";
  NNTR_THROW_IF(input.getData<float>() == nullptr, std::invalid_argument)
    << input.getName() << " is not allocated";
  NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
    << output.getName() << " is not allocated";

  // Slow element-wise path: any innermost stride != 1 or a non-zero beta.
  if (strides[3] != 1 || input.getStrides()[3] != 1 ||
      output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
    for (unsigned int b = 0; b < batch(); ++b) {
      for (unsigned int c = 0; c < channel(); ++c) {
        for (unsigned int h = 0; h < height(); ++h) {
          for (unsigned int w = 0; w < width(); ++w) {
            // output = this + input * beta
            output.setValue(b, c, h, w,
                            getValue(b, c, h, w) +
                              input.getValue<float>(b, c, h, w) * beta);
          }
        }
      }
    }
  } else {
    /** @todo optimize this with combining these loops where stride is 1 */
    if (this->getFormat() == Tformat::NCHW) {
      // Row-wise transform: each (b, c, h) row is contiguous in width.
      for (unsigned int b = 0; b < batch(); ++b) {
        for (unsigned int c = 0; c < channel(); ++c) {
          for (unsigned int h = 0; h < height(); ++h) {
            float *out_data = output.getAddress<float>(b, c, h, 0);
            const float *in_data = input.getAddress<float>(b, c, h, 0);
            const float *_data = (float *)getAddress(getIndex(b, c, h, 0));
            std::transform(_data, _data + width(), in_data, out_data,
                           std::plus<float>());
          }
        }
      }
    } else {
      // NHWC: the channel dimension is innermost instead.
      for (unsigned int b = 0; b < batch(); ++b) {
        for (unsigned int h = 0; h < height(); ++h) {
          for (unsigned int w = 0; w < width(); ++w) {
            float *out_data = output.getAddress<float>(b, 0, h, w);
            const float *in_data = input.getAddress<float>(b, 0, h, w);
            const float *_data = (float *)getAddress(getIndex(b, 0, h, w));
            std::transform(_data, _data + channel(), in_data, out_data,
                           std::plus<float>());
          }
        }
      }
    }
  }

  return output;
}
     451              : 
/**
 * @brief Partial in-place axpy starting at flat index `addr_idx`:
 *        this[addr_idx ...] += alphas[alpha_idx] * m[...], strided by
 *        incY / incX respectively, over `len` elements.
 *
 * @param len       number of elements to update
 * @param addr_idx  starting flat index into this tensor
 * @param m         source tensor
 * @param incX      stride through m's data
 * @param incY      stride through this tensor's data
 * @param alphas    tensor holding the scale factor
 * @param alpha_idx flat index of the scale factor within `alphas`
 * @return ML_ERROR_NONE
 */
int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
                               Tensor &m, unsigned int incX, unsigned int incY,
                               const Tensor alphas, unsigned int alpha_idx) {
  saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
        (float *)getAddress(addr_idx), incY);

  return ML_ERROR_NONE;
}
     460              : 
     461         7432 : Tensor &FloatTensor::add(float const &value, Tensor &output) const {
     462              :   auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
     463         7432 :   apply(f, output);
     464         7432 :   return output;
     465              : }
     466              : 
     467        59384 : Tensor &FloatTensor::add(Tensor const &m, Tensor &output,
     468              :                          float const alpha) const {
     469              :   auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
     470              :                float *out_buf) {
     471       112969 :     ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
     472       112969 :             strides[3]);
     473              :   };
     474        59384 :   apply_broadcast(m, f, output);
     475        59353 :   return output;
     476              : }
     477              : 
     478          174 : Tensor &FloatTensor::subtract(float const &value, Tensor &output) const {
     479              :   auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
     480          174 :   apply(f, output);
     481          174 :   return output;
     482              : }
     483              : 
/**
 * @brief Reduce each batch slice to a scalar: output[b] = sum of all
 *        feature elements of batch b.
 *
 * Implemented as a GEMV of the (batch x feat_len) view of the data with a
 * ones vector. `output` is assumed to be preallocated with at least
 * `batch` elements — TODO confirm against callers.
 */
void FloatTensor::sum_by_batch(Tensor &output) const {
  size_t feat_len = dim.getFeatureLen();
  size_t batch = dim.batch();

  const float *data = (float *)getData();
  float *out_data = output.getData<float>();

  Tensor ones(1, 1, 1, feat_len, this->getFormat());
  ones.setValue(1.0);
  sgemv((unsigned int)dim.getStorageOrder(), false, (int)batch, (int)feat_len,
        1, data, (int)feat_len, ones.getData<float>(), 1, 0.0, out_data, 1);
}
     496              : 
     497       126666 : Tensor &FloatTensor::sum(unsigned int axis, Tensor &output, float alpha,
     498              :                          float beta) const {
     499       126666 :   const float *data = (float *)getData();
     500              : 
     501       126666 :   NNTR_THROW_IF(!contiguous, std::invalid_argument)
     502              :     << getName() << " is not contiguous, cannot sum";
     503              : 
     504       126666 :   if (axis >= 4)
     505            3 :     throw std::out_of_range("Error: axis is invalid");
     506              : 
     507       126663 :   if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
     508       111118 :     CREATE_IF_EMPTY_DIMS(output, dim);
     509        58861 :     scopy(size(), (float *)getData(), 1, output.getData<float>(), 1);
     510        58861 :     return output;
     511              :   }
     512              : 
     513        67802 :   switch (axis) {
     514         1654 :   case 0: {
     515         1939 :     CREATE_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(),
     516              :                          getTensorType());
     517         1654 :     size_t feat_len = dim.getFeatureLen();
     518         1654 :     size_t batch = dim.batch();
     519         1654 :     Tensor ones(1, 1, 1, batch, getTensorType());
     520         1654 :     ones.setValue(alpha);
     521         1654 :     sgemv((unsigned int)dim.getStorageOrder(), true, (int)batch, (int)feat_len,
     522              :           1, data, (int)feat_len, ones.getData<float>(), 1, beta,
     523              :           output.getData<float>(), 1);
     524         1654 :   } break;
     525           64 :   case 1: {
     526           90 :     CREATE_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType());
     527           64 :     if (this->getFormat() == Tformat::NHWC) {
     528            0 :       unsigned int feat_len = output.getDim().getDataLen();
     529            0 :       unsigned int t_axis = dim[1];
     530            0 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     531            0 :       ones.setValue(alpha);
     532            0 :       sgemv((unsigned int)dim.getStorageOrder(), false, (int)feat_len,
     533              :             (int)t_axis, 1, data, (int)t_axis, ones.getData<float>(), 1, beta,
     534              :             output.getData<float>(), 1);
     535            0 :     } else {
     536           64 :       unsigned int feat_len = dim[2] * dim[3];
     537           64 :       unsigned int t_axis = dim[1];
     538           64 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     539           64 :       ones.setValue(alpha);
     540              :       float *rdata = output.getData<float>();
     541          218 :       for (unsigned int k = 0; k < dim[0]; ++k) {
     542          154 :         sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     543          154 :               (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
     544          154 :               ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
     545              :       }
     546           64 :     }
     547              :   } break;
     548         6658 :   case 2: {
     549        13211 :     CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType());
     550              :     if (this->getFormat() == Tformat::NHWC) {
     551            0 :       unsigned int feat_len = dim[1] * dim[3];
     552            0 :       unsigned int t_axis = dim[2];
     553            0 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     554            0 :       ones.setValue(alpha);
     555              :       float *rdata = output.getData<float>();
     556            0 :       for (unsigned int k = 0; k < dim[0]; ++k) {
     557            0 :         sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     558            0 :               (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
     559            0 :               ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
     560              :       }
     561            0 :     } else {
     562         6658 :       unsigned int t_3 = dim[3];
     563         6658 :       unsigned int t_axis = dim[2];
     564         6658 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     565         6658 :       ones.setValue(alpha);
     566              : 
     567         6658 :       if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
     568              :         float *rdata = output.getData<float>();
     569        13364 :         for (unsigned int k = 0; k < dim[0]; ++k) {
     570        13516 :           for (unsigned int c = 0; c < dim[1]; ++c) {
     571         6810 :             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
     572              :             unsigned int ridx =
     573         6810 :               k * output.getDim().getFeatureLen() + c * dim[3];
     574              : 
     575        13620 :             sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     576         6810 :                   (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
     577         6810 :                   beta, &rdata[ridx], 1);
     578              :           }
     579              :         }
     580              :       } else {
     581            0 :         sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     582            0 :               (int)output.getDim().getDataLen(), 1, data, (int)t_axis,
     583              :               ones.getData<float>(), 1, beta, output.getData<float>(), 1);
     584              :       }
     585         6658 :     }
     586              :   } break;
     587        59426 :   case 3: {
     588       117785 :     CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1,
     589              :                          this->getTensorType());
     590        59426 :     if (this->getFormat() == Tformat::NHWC) {
     591            0 :       unsigned int t_3 = dim[1];
     592            0 :       unsigned int t_axis = dim[3];
     593            0 :       Tensor ones(1, 1, 1, t_axis, getTensorType());
     594            0 :       ones.setValue(alpha);
     595              :       float *rdata = output.getData<float>();
     596            0 :       for (unsigned int k = 0; k < dim[0]; ++k) {
     597            0 :         for (unsigned int c = 0; c < dim[2]; ++c) {
     598            0 :           unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
     599            0 :           unsigned int ridx = k * output.getDim().getFeatureLen() + c * dim[1];
     600            0 :           sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
     601            0 :                 (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
     602            0 :                 beta, &rdata[ridx], 1);
     603              :         }
     604              :       }
     605            0 :     } else {
     606        59426 :       unsigned int m = output.getDim().getDataLen();
     607        59426 :       unsigned int n = dim[3];
     608        59426 :       Tensor ones(1, 1, 1, n, getTensorType());
     609        59426 :       ones.setValue(alpha);
     610              : 
     611        59426 :       if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
     612        59426 :         sgemv((unsigned int)dim.getStorageOrder(), false, (int)m, (int)n, 1,
     613              :               data, (int)n, ones.getData<float>(), 1, beta,
     614              :               output.getData<float>(), 1);
     615              :       } else {
     616              :         float *rdata = output.getData<float>();
     617              : 
     618            0 :         for (unsigned int k = 0; k < dim[0]; ++k) {
     619            0 :           for (unsigned int c = 0; c < dim[1]; ++c) {
     620            0 :             unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
     621            0 :             unsigned int ridx = k * dim[1] * dim[2] + c * dim[2];
     622              : 
     623            0 :             sgemv((unsigned int)dim.getStorageOrder(), false, (int)dim[2],
     624            0 :                   (int)n, 1, &data[idx], (int)dim[2], ones.getData<float>(), 1,
     625            0 :                   beta, &rdata[ridx], 1);
     626              :           }
     627              :         }
     628              :       }
     629        59426 :     }
     630              :   } break;
     631              :   default:
     632              :     throw std::out_of_range("Error: Dimension cannot exceed 3");
     633              :   }
     634              : 
     635              :   return output;
     636              : }
     637              : 
     638            0 : Tensor &FloatTensor::abs(Tensor &output) const {
     639              :   auto f = [](float in) { return std::abs(in); };
     640            0 :   apply(f, output);
     641            0 :   return output;
     642              : }
     643              : 
     644         1931 : float FloatTensor::l2norm() const {
     645         1931 :   return snrm2(size(), (float *)getData(), 1);
     646              : }
     647              : 
/**
 * @brief In-place Lp normalization along one axis.
 *
 * Each vector along @a dim is scaled by 1 / max(||v||_2, epsilon).
 * Only p == 2 (L2 norm) and the innermost axis (dim == 3 with unit
 * stride) are supported; anything else throws.
 *
 * @param dim axis index to normalize over (note: shadows the member
 *            `this->dim`, which is why the member is accessed explicitly)
 * @param p norm order; must be 2.0f
 * @param epsilon lower bound on the norm to avoid division by zero
 * @throws std::invalid_argument if the tensor is non-contiguous or p != 2
 * @throws nntrainer::exception::not_supported for unsupported dim/stride
 */
void FloatTensor::normalization_i(unsigned int dim, float p, float epsilon) {
  NNTR_THROW_IF(!contiguous, std::invalid_argument)
    << getName() << " is not contiguous, cannot do normalization.";

  NNTR_THROW_IF(p != 2.0f, std::invalid_argument)
    << "Only L2 norm (p=2.0) is supported currently";

  float *data = (float *)getData();
  // length of one vector along the requested axis
  size_t dim_size = this->dim.getTensorDim(dim);
  size_t stride = strides[dim];

  if (dim == 3 && stride == 1) {
    size_t total_elements = size();
    // contiguous layout: the tensor is exactly num_vectors back-to-back
    // vectors of dim_size floats, so each can be normalized independently
    int num_vectors = static_cast<int>(total_elements / dim_size);

#pragma omp parallel for
    for (int i = 0; i < num_vectors; ++i) {
      float *vec_ptr = data + i * dim_size;
      float norm = snrm2(dim_size, vec_ptr, 1);
      // clamp by epsilon so an all-zero vector stays finite
      float scale = 1.0f / std::max(norm, epsilon);
      sscal(dim_size, scale, vec_ptr, 1);
    }
  } else {
    throw nntrainer::exception::not_supported(
      "FloatTensor::normalization_i currently only optimizes for the last "
      "dimension (dim=3) with stride 1.");
  }
}
     676              : 
     677          790 : Tensor &FloatTensor::pow(float exponent, Tensor &output) const {
     678       128994 :   auto f = [exponent](float in) { return powf(in, exponent); };
     679          790 :   apply(f, output);
     680          790 :   return output;
     681              : }
     682              : 
     683            6 : Tensor &FloatTensor::sqrt(Tensor &output) const {
     684              :   auto f = [](float in) { return std::sqrt(in); };
     685            6 :   apply(f, output);
     686            6 :   return output;
     687              : }
     688              : 
     689            1 : Tensor &FloatTensor::erf(Tensor &output) const {
     690              :   auto f = [](float in) { return std::erf(in); };
     691            1 :   apply(f, output);
     692            1 :   return output;
     693              : }
     694              : 
     695           11 : void FloatTensor::sin(Tensor &out, float alpha) {
     696           11 :   if (!contiguous) {
     697           90 :     auto f = [alpha](float val) -> float { return std::sin(alpha * val); };
     698            2 :     apply(f, out);
     699              :   } else {
     700           10 :     sine(size(), (float *)getData(), out.getData<float>(), alpha);
     701              :   }
     702           11 : }
     703              : 
     704           14 : void FloatTensor::cos(Tensor &out, float alpha) {
     705           14 :   if (!contiguous) {
     706           90 :     auto f = [alpha](float val) -> float { return std::cos(alpha * val); };
     707            2 :     apply(f, out);
     708              :   } else {
     709           13 :     cosine(size(), (float *)getData(), out.getData<float>(), alpha);
     710              :   }
     711           14 : }
     712              : 
     713            6 : void FloatTensor::tan(Tensor &output, float alpha) {
     714           12 :   auto f = [alpha](float val) -> float { return std::tan(alpha * val); };
     715            6 :   apply(f, output);
     716            6 : }
     717              : 
     718            4 : void FloatTensor::inv_sqrt(Tensor &out) {
     719            4 :   apply([](float val) -> float { return 1 / std::sqrt(val); }, out);
     720            4 : }
     721              : 
     722        36830 : Tensor &FloatTensor::dot(Tensor const &input, Tensor &output, bool trans,
     723              :                          bool trans_in, float beta) const {
     724              :   /**
     725              :    * @note FP32.dot(input);
     726              :    * according to the input type, invoked kernels can be varied.
     727              :    */
     728        36830 :   switch (input.getDataType()) {
     729              :   /** applying sgemm/sgemv after type casting to FP32 */
     730        36824 :   case Tdatatype::FP32:
     731        36824 :     dotFloat(input, output, trans, trans_in, beta);
     732        36821 :     break;
     733            0 :   case Tdatatype::FP16:
     734            0 :     dotFloat32Float16(input, output, trans, trans_in, beta);
     735            0 :     break;
     736              :   /** applying gemm_q4_k / gemm_q6_k / gemm_q4_0 */
     737            6 :   case Tdatatype::Q4_K:
     738              :   case Tdatatype::Q6_K:
     739              :   case Tdatatype::Q4_0:
     740            6 :     dotQnK(input, output, trans, trans_in, beta, input.getDataType());
     741            6 :     break;
     742            0 :   case Tdatatype::QINT16:
     743              :   case Tdatatype::QINT8:
     744              :   case Tdatatype::QINT4:
     745            0 :     dotQInteger(input, output, trans, trans_in, beta, input.getDataType());
     746            0 :     break;
     747            0 :   default:
     748            0 :     throw std::invalid_argument("Error: unsupported datatype");
     749              :   }
     750        36827 :   return output;
     751              : }
     752              : 
/**
 * @brief Multi-weight dot product: this tensor against several weight
 *        tensors at once, one output per input.
 *
 * Non-Q4_0 / non-QINT4 inputs are simply forwarded one-by-one to the
 * scalar dot() overload. For Q4_0 / QINT4 weights the per-tensor widths
 * and data pointers are gathered first so batched (and, when OpenCL is
 * enabled, asynchronous GPU) kernels can be used.
 *
 * @param input weight tensors; all are assumed to share one data type
 *              (only input[0]'s dtype is inspected — TODO confirm callers
 *              guarantee this)
 * @param output destination tensors, parallel to @a input
 * @param trans / trans_in transpose flags (forwarded on the generic path)
 * @param beta output scale (forwarded on the generic path; the quantized
 *             kernels below do not take it)
 */
void FloatTensor::dot(std::vector<Tensor *> input, std::vector<Tensor *> output,
                      bool trans, bool trans_in, float beta) const {
  float *data = (float *)getData();
  unsigned int M = getDim().height();
  unsigned int K = getDim().width();
  Tdatatype input_dtype = input[0]->getDataType();

  // Handle standard inputs: delegate to the single-tensor dot() per pair
  if (input_dtype != Tdatatype::Q4_0 && input_dtype != Tdatatype::QINT4) {
    for (unsigned int i = 0; i < input.size(); ++i) {
      dot(*input[i], *output[i], trans, trans_in, beta);
    }
    return;
  }

  // Gather per-weight widths (N), weight pointers and result pointers
  std::vector<unsigned int> Ns;
  std::vector<void *> mdatas;
  std::vector<float *> rdatas;

  for (unsigned int i = 0; i < input.size(); ++i) {
    Ns.push_back(input[i]->getDim().width());
    mdatas.push_back((void *)input[i]->getData<uint8_t>());
    rdatas.push_back(output[i]->getData<float>());
  }

#ifdef ENABLE_OPENCL
  if (input_dtype == Tdatatype::Q4_0) {
    if (M == 1) {
      // GEMV-shaped work: CPU kernel per weight
      for (unsigned int i = 0; i < input.size(); ++i) {
        gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
      }
    } else {
      gemm_q4_0_async_cl(mdatas, data, rdatas, M, Ns, K);
    }
  } else { // QINT4
    /// Run on GPU only when memory is a Shared Virtual Memory
    if (input[0]->getMemoryData()->isSVM() &&
        output[0]->getMemoryData()->isSVM() && getMemoryData()->isSVM()) {
      std::vector<uint16_t *> scales;
      for (unsigned int i = 0; i < input.size(); ++i) {
        scales.push_back(input[i]->getScale<uint16_t>());
      }
      if (M == 1) {
        gemv_int4_async_cl(mdatas, scales, data, rdatas, K, Ns,
                           Int4QTensor::getGroupSize());
      } else {
        gemm_int4_async_cl(data, mdatas, scales, rdatas, M, Ns, K,
                           Int4QTensor::getGroupSize());
      }
    } else {
      /// @todo This should be replaced with standard CPU INT4 computation
      for (unsigned int i = 0; i < input.size(); ++i) {
        gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
                  rdatas[i], Ns[i]);
      }
    }
  }
#else
  if (input_dtype == Tdatatype::Q4_0) {
    /// @todo Support multi-weight q4_0 for x64
    for (unsigned int i = 0; i < input.size(); ++i) {
      gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
    }
  } else { // QINT4
    /// @note It is essential to understand that this section of the code
    /// requires the `input` data to be converted to Q4_0 type, not QINT4 type.
    /// This should be replaced with standard CPU INT4 computation instead of
    /// using Q4_0.
    for (unsigned int i = 0; i < input.size(); ++i) {
      gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
                rdatas[i], Ns[i]);
    }
  }
#endif
}
     828              : 
/**
 * @brief FP32 x FP32 dot product, flattening the 4-D tensors into a 2-D
 *        GEMM/GEMV/DOT problem and selecting the cheapest BLAS call.
 *
 * @param input right-hand FP32 operand
 * @param output destination tensor (created/validated by calculateFlattenDot)
 * @param trans / trans_in transpose flags for this tensor / @a input
 * @param beta scale on pre-existing output values
 * @return reference to @a output
 */
Tensor &FloatTensor::dotFloat(Tensor const &input, Tensor &output, bool trans,
                              bool trans_in, float beta) const {
  // Commented out with intention to support the calculation wrt. batch and
  // height direction. It supposes to have this->dim as [ BxCxH,W ] and
  // input.dim is [BxCxH,W] as well if (input.dim.rank() > 2) {
  //   throw exception::not_supported("Error: support only for rank of dot "
  //                                  "matrix <= 2");
  // }

  // Commented out with intention to support the calculation wrt. batch and
  // height direction of this tensor. It is OK as long as input is 2D
  if (trans && dim.rank() > 2) {
    ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
  }
  // 2-D view of both operands plus the post-transpose GEMM shape (M,N,K)
  // and leading dimensions — all filled in by calculateFlattenDot
  unsigned int first_three_flat, last_axis, input_first_three_flat,
    input_last_axis, M, N, K, lda, ldb, ldc;

  calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
                      last_axis, input_first_three_flat, input_last_axis, M, N,
                      K, lda, ldb, ldc);

  const float *data = (float *)getData();
  const float *mdata = input.getData<float>();
  float *rdata = output.getData<float>();
  const float alpha = 1.0f;

  /// shortcut handling in case of vector
  /// for vector, (1 * K) == (K * 1) in current memory layout...
  /// and please note that N, K, M is a fixed place holder after considering
  /// transpose.
  /// For example, there is no case like (1 * K) X (1 * K) while
  /// (1 * K) X (1 * M) can be a case
  /// case1: (1 * K) X (K * 1) -> scalar result via sdot
  if (M == 1 && N == 1) {
    // beta == 0 must not read *rdata (it may be uninitialized)
    *rdata =
      sdot(K, data, 1, mdata, 1) + ((0.0f == beta) ? 0.0f : beta * *rdata);
  }
  /// case2: (M * K) X (K * 1) -> matrix-vector product
  else if (N == 1) {
    sgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
          last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
  }
  /// case3: (1 * K) X (K * N) = 1 * N = R
  /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
  /// Effectively a translation of sgemv
  else if (M == 1) {
    sgemv((unsigned int)dim.getStorageOrder(), !trans_in,
          input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
          beta, rdata, 1);
  }
  /// case others: use gemm
  else {
    sgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
          data, lda, mdata, ldb, beta, rdata, ldc);
  }

  return output;
}
     887              : 
     888            0 : Tensor &FloatTensor::dotFloat32Float16(Tensor const &input, Tensor &output,
     889              :                                        bool trans, bool trans_in,
     890              :                                        float beta) const {
     891              : /// @todo remove #ifdef ENABLE_FP16
     892              : #ifdef ENABLE_FP16
     893              : 
     894              :   // Comment out with intension to support the calculation wrt. batch and
     895              :   // height direction. It supposes to have this->dim as [ BxCxH,W ] and
     896              :   // input.dim is [BxCxH,W] as well if (input.dim.rank() > 2) {
     897              :   //   throw exception::not_supported("Error: support only for rank of dot "
     898              :   //                                  "matrix <= 2");
     899              :   // }
     900              : 
     901              :   // Comment out with intension to support the calculation wrt. batch and
     902              :   // height direction of this tensor. It is OK as long as input is 2D
     903              :   if (trans && dim.rank() > 2) {
     904              :     ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
     905              :   }
     906              :   unsigned int first_three_flat, last_axis, input_first_three_flat,
     907              :     input_last_axis, M, N, K, lda, ldb, ldc;
     908              : 
     909              :   calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
     910              :                       last_axis, input_first_three_flat, input_last_axis, M, N,
     911              :                       K, lda, ldb, ldc);
     912              : 
     913              :   const float *data = (float *)getData();
     914              :   const _FP16 *mdata = input.getData<_FP16>();
     915              :   float *rdata = output.getData<float>();
     916              :   const float alpha = 1.0f;
     917              : 
     918              :   /// shortcut handling in case of vector
     919              :   /// for vector, (1 * K) == (K * 1) in current memory layout...
     920              :   /// and please note that N, K, M is a fixed place holder after considering
     921              :   /// transpose.
     922              :   /// For example, there is no case like (1 * K) X (1 * K) while
     923              :   /// (1 * K) X (1 * M) can be a case
     924              :   /// case1: (1 * K) X (K * 1)
     925              :   NNTR_THROW_IF((M == 1 && N == 1), std::invalid_argument)
     926              :     << "dotQnK does not support trans / trans_in";
     927              :   /// case2: (M * K) X (K * 1)
     928              :   if (N == 1) {
     929              :     shgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
     930              :            last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
     931              :   }
     932              :   /// case3: (1 * K) X (K * N) = 1 * N = R
     933              :   /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
     934              :   /// Effectively a translation of sgemv
     935              :   else if (M == 1) {
     936              :     hsgemv((unsigned int)dim.getStorageOrder(), !trans_in,
     937              :            input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
     938              :            beta, rdata, 1);
     939              :   }
     940              :   /// case others: use gemm
     941              :   else {
     942              :     shgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
     943              :            data, lda, mdata, ldb, beta, rdata, ldc);
     944              :   }
     945              : 
     946              :   return output;
     947              : #else
     948            0 :   throw std::invalid_argument("Error: enable-fp16 is not enabled");
     949              : #endif
     950              : }
     951              : 
     952            6 : Tensor &FloatTensor::dotQnK(Tensor const &input, Tensor &output, bool trans,
     953              :                             bool trans_in, float beta, Tdatatype dtype) const {
     954              :   ///@note Be cautious.
     955              :   /// Qn_K does not support transpose in principle.
     956              :   /// This trans option only aims to support Tensor Dimension only,
     957              :   /// not data.
     958              :   ///@note trans is not yet applied
     959            6 :   NNTR_THROW_IF(trans, std::invalid_argument)
     960              :     << "dotQnK does not support trans";
     961              : 
     962            6 :   float *data = (float *)getData();
     963              :   uint8_t *mdata = input.getData<uint8_t>();
     964              :   float *rdata = output.getData<float>();
     965              : 
     966              :   unsigned int M, N, K;
     967            6 :   M = getDim().height();
     968            6 :   K = getDim().width();
     969            6 :   N = trans_in ? input.getDim().height() : input.getDim().width();
     970              : 
     971            6 :   switch (dtype) {
     972            3 :   case Tdatatype::Q4_K:
     973            3 :     gemm_q4_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
     974            3 :     break;
     975            2 :   case Tdatatype::Q6_K:
     976            2 :     gemm_q6_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
     977            2 :     break;
     978            1 :   case Tdatatype::Q4_0:
     979            1 :     M = getDim().height();
     980            1 :     K = getDim().width();
     981            1 :     N = input.getDim().width();
     982              : #ifdef ENABLE_OPENCL
     983              :     if (M == 1) {
     984              :       gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
     985              :     } else {
     986              :       gemm_q4_0_cl((void *)mdata, data, rdata, M, N, K);
     987              :     }
     988              : #else
     989            1 :     gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
     990              : #endif
     991            1 :     break;
     992              : 
     993            0 :   default:
     994            0 :     throw std::invalid_argument("Error: unsupported datatype");
     995              :   }
     996              : 
     997            6 :   return output;
     998              : }
     999              : 
/**
 * @brief FP32 (this) x integer-quantized weights (QINT16/QINT8/QINT4).
 *
 * Kernel selection is compile-time: CPU packed-int4 path when FP16 is
 * available, a Q4_0 fallback otherwise, and OpenCL int4 kernels when
 * OpenCL is enabled and all buffers are SVM-backed.
 *
 * @param input quantized right-hand operand
 * @param output FP32 destination; its width supplies N
 * @param trans / trans_in not consulted by the kernels below — TODO
 *        confirm callers only pass false here
 * @param beta currently unused by the selected kernels
 * @param dtype nominal input dtype (not consulted in this body)
 * @return reference to @a output
 */
Tensor &FloatTensor::dotQInteger(Tensor const &input, Tensor &output,
                                 bool trans, bool trans_in, float beta,
                                 Tdatatype dtype) const {

  float *data = (float *)getData();
  char *mdata = input.getData<char>();
  float *rdata = output.getData<float>();

  unsigned int M = getDim().height();
  unsigned int K = getDim().width();
  unsigned int N = output.getDim().width();

#ifndef ENABLE_OPENCL
#ifdef ENABLE_FP16
  if (input.q_scheme() == QScheme::PER_CHANNEL_AFFINE) {
    // kernel variant 1 is the GEMV-shaped path, 5 the GEMM-shaped one
    uint32_t opt_kernel_idx = (M == 1) ? 1 : 5;
    nntr_gemm_qai8dxp_qsi4cxp_packed(
      M, N, K, (void *)data, (void *)mdata, rdata, opt_kernel_idx,
      true); /// @todo kernel supports both trans / noTrans situation
  } else {
    throw std::runtime_error(
      "Error: QINT4 Dot on CPU only supports PER_CHANNEL_AFFINE scheme");
  }
#else
  /// @note It is essential to understand that this section of the code requires
  /// the `input` data to be converted to Q4_0 type, not QINT4 type. This should
  /// be replaced with standard CPU INT4 computation instead of using Q4_0.
  gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
#endif
#else
  // GPU path requires Shared Virtual Memory on all three buffers
  if (input.getMemoryData()->isSVM() && output.getMemoryData()->isSVM() &&
      getMemoryData()->isSVM()) {
    if (M == 1) {
      gemv_int4_cl(mdata, input.getScale<uint16_t>(), data, rdata, K, N,
                   Int4QTensor::getGroupSize());
    } else {
      sgemm_int4_cl(data, mdata, input.getScale<uint16_t>(), rdata, M, N, K,
                    Int4QTensor::getGroupSize());
    }
  } else {
    /// @todo This should be replaced with standard CPU INT4 computation
    gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
  }
#endif

  return output;
}
    1047              : 
/**
 * @brief Copy another tensor into this one, adopting its dimensions.
 *
 * Reshapes this tensor to @a from's dimension first, then copies the raw
 * FP32 buffer via the pointer-based copy() overload.
 * @param from source tensor; read as FP32 — TODO confirm callers only
 *        pass FP32 tensors here (use copyData for other dtypes)
 */
void FloatTensor::copy(const Tensor &from) {
  reshape(from.getDim());
  copy(from.getData<float>());
}
    1052              : 
/**
 * @brief Copy data from a tensor of (possibly) another dtype into this
 *        FP32 tensor, converting element-wise.
 *
 * Unlike copy(), the shape is kept; only sizes must match.
 *
 * @param from source tensor (FP32/FP16/QINT16/QINT8/UINT16/UINT8)
 * @throws std::invalid_argument if this tensor is non-contiguous, sizes
 *         differ, FP16 is requested without ENABLE_FP16, or the source
 *         dtype is unsupported
 */
void FloatTensor::copyData(const Tensor &from) {
  NNTR_THROW_IF(!contiguous, std::invalid_argument)
    << getName() << " is not contiguous, cannot copy.";

  NNTR_THROW_IF(size() != from.size(), std::invalid_argument)
    << "Size of tensor to copy must match";

  // each case dispatches to the matching element-wise conversion kernel
  switch (from.getDataType()) {
  case ml::train::TensorDim::DataType::FP32:
    copy(from.getData<float>());
    break;
  case ml::train::TensorDim::DataType::FP16:
/// @todo remove #ifdef ENABLE_FP16
#ifdef ENABLE_FP16
    scopy(size(), from.getData<_FP16>(), 1, (float *)getData(), 1);
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
    break;
  case ml::train::TensorDim::DataType::QINT16:
    copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
    break;
  case ml::train::TensorDim::DataType::QINT8:
    scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
                          (float *)getData(), 1);
    break;
  case ml::train::TensorDim::DataType::UINT16:
    copy_u16_fp32(from.size(), from.getData<uint16_t>(), (float *)getData());
    break;
  case ml::train::TensorDim::DataType::UINT8:
    scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
                          (float *)getData(), 1);
    break;
  default:
    throw std::invalid_argument(
      "[FloatTensor::copyData] Error: Unsupported data type");
    break;
  }
}
    1092              : 
    1093         3509 : void FloatTensor::copy_with_stride(const Tensor &input, Tensor &output) {
    1094         7132 :   for (unsigned int b = 0; b < output.batch(); ++b) {
    1095         7246 :     for (unsigned int c = 0; c < output.channel(); ++c) {
    1096        16688 :       for (unsigned int h = 0; h < output.height(); ++h) {
    1097        91392 :         for (unsigned int w = 0; w < output.width(); ++w) {
    1098        78327 :           output.setValue(b, c, h, w, input.getValue<float>(b, c, h, w));
    1099              :         }
    1100              :       }
    1101              :     }
    1102              :   }
    1103         3509 : }
    1104              : 
    1105          648 : std::vector<unsigned int> FloatTensor::argmax() const {
    1106              :   std::vector<unsigned int> result;
    1107          648 :   const float *data = (float *)getData();
    1108              :   size_t batch_size = batch();
    1109          648 :   size_t feature_len = dim.getFeatureLen();
    1110              : 
    1111          648 :   result.resize(batch_size);
    1112              : 
    1113         8198 :   for (unsigned int b = 0; b < batch_size; b++) {
    1114              :     auto max_iter =
    1115         7550 :       std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
    1116         7550 :     result[b] = std::distance(data, max_iter) - (b * feature_len);
    1117              :   }
    1118          648 :   return result;
    1119            0 : }
    1120              : 
    1121            0 : std::vector<unsigned int> FloatTensor::argmin() const {
    1122              :   std::vector<unsigned int> result;
    1123            0 :   const float *data = (float *)getData();
    1124              :   size_t batch_size = batch();
    1125            0 :   size_t feature_len = dim.getFeatureLen();
    1126              : 
    1127            0 :   result.resize(batch_size);
    1128              : 
    1129            0 :   for (unsigned int b = 0; b < batch_size; b++) {
    1130              :     auto min_iter =
    1131            0 :       std::min_element(data + b * feature_len, data + (b + 1) * feature_len);
    1132            0 :     result[b] = std::distance(data, min_iter) - (b * feature_len);
    1133              :   }
    1134            0 :   return result;
    1135            0 : }
    1136              : 
    1137            6 : void FloatTensor::topK(unsigned int k, void *output_data,
    1138              :                        uint32_t *indices_data) {
    1139              :   const auto &input_dim = getDim();
    1140              :   const Tformat format = input_dim.getFormat();
    1141            6 :   const auto batch = input_dim.batch();
    1142            6 :   const auto channel = input_dim.channel();
    1143            6 :   const auto height = input_dim.height();
    1144            6 :   const auto width = input_dim.width();
    1145              : 
    1146            6 :   if (k == 0 || k > width) {
    1147              :     throw std::invalid_argument(
    1148            0 :       "k must be greater than 0 and less than or equal to width");
    1149              :   }
    1150              : 
    1151              :   float *output_buffer = static_cast<float *>(output_data);
    1152              : 
    1153              :   // Calculate strides for input and output
    1154            6 :   const auto input_strides = input_dim.computeStrides();
    1155            6 :   TensorDim output_dim = input_dim;
    1156            6 :   output_dim.width(k);
    1157            6 :   const auto output_strides = output_dim.computeStrides();
    1158              : 
    1159              : #ifdef _MSC_VER
    1160              : #pragma warning(push)
    1161              : #pragma warning(disable : 4849)
    1162              : #endif
    1163            6 : #pragma omp parallel for collapse(3)
    1164              : #ifdef _MSC_VER
    1165              : #pragma warning(pop)
    1166              : #endif
    1167              :   for (int b = 0; b < static_cast<int>(batch); ++b) {
    1168              :     for (int c = 0; c < static_cast<int>(channel); ++c) {
    1169              :       for (int h = 0; h < static_cast<int>(height); ++h) {
    1170              : 
    1171              :         size_t offset;
    1172              :         if (format == Tformat::NCHW) {
    1173              :           // NCHW: [b][c][h][i]
    1174              :           offset =
    1175              :             b * input_strides[0] + c * input_strides[1] + h * input_strides[2];
    1176              :         } else {
    1177              :           // NHWC: [b][h][i][c]
    1178              :           offset = b * input_strides[0] + h * input_strides[1] + c;
    1179              :         }
    1180              : 
    1181              :         const unsigned int width_stride =
    1182              :           format == Tformat::NHWC ? input_strides[2] : 1;
    1183              :         const float *B = static_cast<const float *>(getData()) + offset;
    1184              :         std::vector<size_t> idx(width);
    1185              :         std::iota(idx.begin(), idx.end(), 0);
    1186              :         std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
    1187              :                           [&B, width_stride](size_t i1, size_t i2) {
    1188          103 :                             return B[i1 * width_stride] > B[i2 * width_stride];
    1189              :                           });
    1190              : 
    1191              :         // write top-k values and their indices to output
    1192              :         for (unsigned int i = 0; i < k; ++i) {
    1193              :           size_t output_idx;
    1194              :           if (format == Tformat::NCHW) {
    1195              :             // NCHW: [b][c][h][i]
    1196              :             output_idx = b * output_strides[0] + c * output_strides[1] +
    1197              :                          h * output_strides[2] + i;
    1198              :           } else {
    1199              :             // NHWC: [b][h][i][c]
    1200              :             output_idx = b * output_strides[0] + h * output_strides[1] +
    1201              :                          i * output_strides[2] + c;
    1202              :           }
    1203              :           output_buffer[output_idx] = B[idx[i]];
    1204              :           indices_data[output_idx] = static_cast<uint32_t>(idx[i]);
    1205              :         }
    1206              :       }
    1207              :     }
    1208              :   }
    1209            6 : }
    1210              : 
    1211            3 : float FloatTensor::max_abs() const {
    1212            3 :   const float *data = (float *)getData();
    1213            3 :   unsigned int idx = isamax(size(), data, 1);
    1214            3 :   return *(data + idx);
    1215              : }
    1216              : 
    1217          169 : float FloatTensor::maxValue() const {
    1218          169 :   const float *data = (float *)getData();
    1219          169 :   return *std::max_element(data, data + size());
    1220              : }
    1221              : 
    1222          169 : float FloatTensor::minValue() const {
    1223          169 :   const float *data = (float *)getData();
    1224          169 :   return *std::min_element(data, data + size());
    1225              : }
    1226              : 
/**
 * @brief Transpose this tensor into @p output following @p direction.
 *
 * @param direction permutation string "A:B:C" of digits in {0,1,2}; the
 *        digits appear to name the (channel, height, width) axes in their
 *        new order (batch is never permuted). Only direction[0] and
 *        direction[2] are inspected here; dim.transpose(direction) performs
 *        validation and may throw on a malformed string.
 * @param output reshaped to the transposed dimension and filled with the
 *        permuted data.
 * @return reference to @p output.
 *
 * @note inptr/outptr look unused but are consumed implicitly by the
 *       transposeloop / transposeloop_nhwc macros below.
 */
Tensor &FloatTensor::transpose(const std::string &direction,
                               Tensor &output) const {
  unsigned int SL, SI, SJ, SK;

  // Resize output first; throws before any data is touched if the
  // direction string / shape is invalid.
  output.reshape(dim.transpose(direction));

  // First and third digits of "A:B:C" select the loop permutation below.
  int indexI = direction[0] - '0';
  int indexJ = direction[2] - '0';

  // Source extents: SL=batch, SI=channel, SJ=height, SK=width.
  SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();

  bool is_format_nchw = (getFormat() == Tformat::NCHW);

  const float *inptr = (float *)getData();
  float *outptr = output.getData<float>();
  switch (indexI) {
  case 0:
    if (indexJ == 1) {
      // "0:1:2"-style: axes keep relative order.
      if (is_format_nchw) {
        transposeloop(l, i, j, k, SL, SI, SJ, SK);
      } else {
        transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
      }
    } else {
      if (is_format_nchw) {
        // "0:2:1": swap height/width per (batch, channel) plane — use the
        // dedicated 2-D matrix transpose kernel instead of a generic loop.
        for (unsigned int b = 0; b < batch(); ++b) {
          for (unsigned int c = 0; c < channel(); ++c) {
            transpose_matrix(
              height(), width(), (float *)getData() + getIndex(b, c, 0, 0),
              width(), (float *)output.getData() + output.getIndex(b, c, 0, 0),
              output.width());
          }
        }
      } else {
        transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
      }
    }
    break;
  case 1:
    if (indexJ == 0) {
      // "1:0:2": swap channel/height.
      if (is_format_nchw) {
        transposeloop(l, j, i, k, SL, SJ, SI, SK);
      } else {
        transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
      }
    } else {
      // "1:2:0": rotate channel->last.
      if (is_format_nchw) {
        transposeloop(l, j, k, i, SL, SJ, SK, SI);
      } else {
        transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
      }
    }
    break;
  case 2:
    if (indexJ == 0) {
      // "2:0:1": rotate width->first.
      if (is_format_nchw) {
        transposeloop(l, k, i, j, SL, SK, SI, SJ);
      } else {
        transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
      }
    } else {
      // "2:1:0": full reversal of (channel, height, width).
      if (is_format_nchw) {
        transposeloop(l, k, j, i, SL, SK, SJ, SI);
      } else {
        transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
      }
    }
    break;
  }

  return output;
}
    1299              : 
    1300           10 : void FloatTensor::dropout_mask(float dropout) {
    1301           10 :   float scale = 1.0 / (1 - dropout);
    1302           10 :   float *data_ = (float *)getData();
    1303          370 :   for (unsigned int i = 0; i < size(); ++i) {
    1304          360 :     if (data_[i] >= dropout)
    1305          148 :       data_[i] = scale;
    1306              :     else
    1307          212 :       data_[i] = 0.0;
    1308              :   }
    1309           10 : }
    1310              : 
    1311            0 : void FloatTensor::filter_mask(const Tensor &mask_len, bool reverse) {
    1312              :   float fill_mask_val = 0.0;
    1313              :   float en_mask_val = 1.0 - fill_mask_val;
    1314              : 
    1315            0 :   if (reverse) {
    1316              :     fill_mask_val = 1.0;
    1317              :     en_mask_val = 1.0 - fill_mask_val;
    1318              :   }
    1319              : 
    1320            0 :   setValue(fill_mask_val);
    1321              : 
    1322            0 :   NNTR_THROW_IF(mask_len.batch() != batch(), std::invalid_argument)
    1323              :     << "Number of filter masks mismatched";
    1324              : 
    1325            0 :   for (unsigned int b = 0; b < batch(); b++) {
    1326            0 :     float *addr = (float *)getAddress(getIndex(b, 0, 0, 0));
    1327              :     const unsigned int *mask_len_val =
    1328            0 :       mask_len.getAddress<unsigned int>(b, 0, 0, 0);
    1329            0 :     std::fill(addr, addr + (*mask_len_val), en_mask_val);
    1330              :   }
    1331            0 : }
    1332              : 
    1333            3 : void FloatTensor::zoneout_mask(Tensor &opposite, float zoneout) {
    1334            3 :   opposite.setRandBernoulli(zoneout);
    1335              : 
    1336            3 :   float *data = (float *)getData();
    1337              :   float *opposite_data = opposite.getData<float>();
    1338              : 
    1339      2010003 :   for (unsigned int i = 0; i < size(); ++i) {
    1340      2010000 :     if (opposite_data[i] > epsilon) {
    1341       603513 :       data[i] = 0.0f;
    1342              :     } else {
    1343      1406487 :       data[i] = 1.0f;
    1344              :     }
    1345              :   }
    1346            3 : }
    1347              : 
/**
 * @brief Split this tensor into several tensors along @p axis.
 *
 * @param sizes extent of each output slice along @p axis; must sum exactly
 *        to this tensor's dimension on that axis.
 * @param axis axis to split on; -1 is treated as the last axis (3 = width).
 * @return one newly allocated Tensor per entry in @p sizes.
 * @throws std::invalid_argument when the sizes do not sum to the axis length.
 */
std::vector<Tensor> FloatTensor::split(std::vector<size_t> sizes, int axis) {
  size_t num_size = sizes.size();

  if (axis == -1) {
    axis = 3;
  }

  // Sanity check: slices must tile the chosen axis completely.
  size_t total_size =
    std::accumulate(sizes.begin(), sizes.end(), static_cast<size_t>(0));
  NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument)
    << "given sum of sizes did not match with origin tensor dim, tensor dim: "
    << dim.getTensorDim(axis) << " total size: " << total_size;

  // Each output dim is this tensor's dim with the split axis shrunk.
  std::vector<TensorDim> ret_dims(num_size, dim);
  for (unsigned int i = 0; i < num_size; ++i) {
    ret_dims[i].setTensorDim(axis, sizes[i]);
  }

  bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false;
  std::vector<Tensor> ret;

  // Odometer-style cursor: returns the source element at `loc`, then
  // advances `loc` (least-significant axis first), wrapping each position
  // back by reset_dim_arr[i] when it reaches end_loc[i]. For NHWC, `loc`
  // is kept in storage order (b, h, w, c), hence the re-mapped getValue.
  auto iter_value = [this, is_format_nchw](
                      std::array<size_t, 4> &loc,
                      const std::array<size_t, 4> &end_loc,
                      const std::array<size_t, 4> &reset_dim_arr) -> float & {
    auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
                                   : getValue(loc[0], loc[3], loc[1], loc[2])
;
    for (int i = 3; i >= 0; --i) {
      loc[i]++;
      if (loc[i] == end_loc[i]) {
        loc[i] -= reset_dim_arr[i];
        continue;
      }
      break;
    }
    return value;
  };

  unsigned int accumulated_size = 0;
  for (unsigned int i = 0; i < num_size; ++i) {
    std::array<size_t, 4> loc = {0, 0, 0, 0};

    // Start the cursor at this slice's offset along the split axis; for
    // NHWC the logical axis index must be translated into storage order.
    if (is_format_nchw) {
      loc[axis] += accumulated_size;
    } else {
      if (axis == 0) {
        loc[0] += accumulated_size;
      } else if (axis == 1) {
        loc[3] += accumulated_size;
      } else if (axis == 2 || axis == 3) {
        loc[axis - 1] += accumulated_size;
      }
    }

    ret.push_back(Tensor(ret_dims[i]));
    auto &ret_t = ret.back();

    // end_loc: exclusive upper bound for each cursor position, in the
    // same (storage) order the cursor uses.
    std::array<size_t, 4> end_loc;

    if (is_format_nchw) {
      end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
                 ret_dims[i].height(), ret_dims[i].width()};
    } else {
      end_loc = {ret_dims[i].batch(), ret_dims[i].height(), ret_dims[i].width(),
                 ret_dims[i].channel()};
    }

    accumulated_size += sizes[i];

    // On the split axis the bound is the running offset, not the slice size.
    if (is_format_nchw) {
      end_loc[axis] = accumulated_size;
    } else {
      if (axis == 0) {
        end_loc[0] = accumulated_size;
      } else if (axis == 1) {
        end_loc[3] = accumulated_size;
      } else if (axis == 2 || axis == 3) {
        end_loc[axis - 1] = accumulated_size;
      }
    }

    // Wrap-around amounts per position = this slice's extent on that axis.
    std::array<size_t, 4> reset_dim_arr;
    if (is_format_nchw) {
      reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
                       ret_dims[i].height(), ret_dims[i].width()};
    } else {
      reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
                       ret_dims[i].width(), ret_dims[i].channel()};
    }

    // Stream source values into the slice; apply_i visits every element of
    // ret_t in order while the cursor walks the matching source region.
    ret_t.apply_i<float>(
      [&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
        return iter_value(loc, end_loc, reset_dim_arr);
      });
  }

  return ret;
}
    1446              : 
/**
 * @brief Copy @p tensors back-to-back along @p axis into @p output.
 *
 * @param tensors source tensors; the first one's format decides NCHW/NHWC
 *        handling (callers are presumably expected to pass compatible
 *        shapes/formats — not re-validated here).
 * @param axis axis along which the sources are laid out in @p output.
 * @param output pre-sized destination tensor.
 * @return @p output.
 */
Tensor FloatTensor::concat(const std::vector<Tensor> &tensors, int axis,
                           Tensor &output) {
  bool is_format_nchw = (tensors.front().getDim().getFormat() == Tformat::NCHW);

  // Odometer-style cursor into `t` (the output here): returns the element
  // at `loc`, then advances `loc` least-significant axis first, wrapping a
  // position back to start_loc[i] once it has covered ref_dim_arr[i]
  // entries. For NHWC, `loc` is kept in storage order (b, h, w, c).
  auto iter_value =
    [is_format_nchw](std::array<unsigned, 4> &loc,
                     const std::array<unsigned, 4> &start_loc, Tensor &t,
                     const std::array<unsigned, 4> &ref_dim_arr) -> float & {
    auto &value = is_format_nchw
                    ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
                    : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);

    for (int i = 3; i >= 0; --i) {
      loc[i]++;
      if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
        loc[i] = start_loc[i];
        continue;
      }
      break;
    }
    return value;
  };

  std::array<unsigned, 4> loc = {0, 0, 0, 0};
  for (auto &t : tensors) {
    // Each source occupies a sub-box of output beginning at the current loc.
    std::array<unsigned, 4> start_loc = loc;
    std::array<unsigned, 4> tensor_dim_arr;
    TensorDim curr_dim = t.getDim();

    // Source extents, reordered into the cursor's storage order for NHWC.
    tensor_dim_arr[0] = curr_dim.getTensorDim(0);
    tensor_dim_arr[1] =
      is_format_nchw ? curr_dim.getTensorDim(1) : curr_dim.getTensorDim(2);
    tensor_dim_arr[2] =
      is_format_nchw ? curr_dim.getTensorDim(2) : curr_dim.getTensorDim(3);
    tensor_dim_arr[3] =
      is_format_nchw ? curr_dim.getTensorDim(3) : curr_dim.getTensorDim(1);

    // Flat scan of the source, scattered into output through the cursor.
    for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
      iter_value(loc, start_loc, output, tensor_dim_arr) = t.getValue<float>(i);
    }

    // Advance the write origin along the concat axis (translated to
    // storage order for NHWC) for the next source tensor.
    if (is_format_nchw) {
      loc[axis] += curr_dim.getTensorDim(axis);
    } else {
      if (axis == 0) {
        loc[0] += curr_dim.getTensorDim(axis);
      } else if (axis == 1) {
        loc[3] += curr_dim.getTensorDim(axis);
      } else if (axis == 2 || axis == 3) {
        loc[axis - 1] += curr_dim.getTensorDim(axis);
      }
    }
  }

  return output;
}
    1503              : 
    1504           10 : void FloatTensor::print(std::ostream &out) const {
    1505           10 :   const float *data = (float *)getData();
    1506           10 :   unsigned int len = size();
    1507           10 :   out << "data addr: " << data << '\n';
    1508           10 :   out << dim;
    1509              : 
    1510           10 :   if (len > 100) {
    1511            6 :     out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
    1512            6 :         << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1] << ']'
    1513              :         << std::endl;
    1514            6 :     return;
    1515              :   }
    1516              : 
    1517            4 :   std::ios init(NULL);
    1518            4 :   init.copyfmt(out);
    1519              : 
    1520            4 :   if (getFormat() == Tformat::NCHW) {
    1521           10 :     for (unsigned int k = 0; k < batch(); k++) {
    1522           12 :       for (unsigned int l = 0; l < channel(); l++) {
    1523           17 :         for (unsigned int i = 0; i < height(); i++) {
    1524           47 :           for (unsigned int j = 0; j < width(); j++) {
    1525              :             out << std::setw(10) << std::setprecision(10)
    1526           36 :                 << data[getIndex(k, l, i, j)] << " ";
    1527              :           }
    1528              :           out << std::endl;
    1529              :         }
    1530              :         out << std::endl;
    1531              :       }
    1532              :       out << "-------" << std::endl;
    1533              :     }
    1534              :   } else {
    1535            0 :     for (unsigned int k = 0; k < batch(); k++) {
    1536            0 :       for (unsigned int i = 0; i < height(); i++) {
    1537            0 :         for (unsigned int j = 0; j < width(); j++) {
    1538            0 :           for (unsigned int l = 0; l < channel(); l++) {
    1539              :             out << std::setw(10) << std::setprecision(10)
    1540            0 :                 << data[getIndex(k, l, i, j)] << " ";
    1541              :           }
    1542              :           out << std::endl;
    1543              :         }
    1544              :         out << std::endl;
    1545              :       }
    1546              :       out << "-------" << std::endl;
    1547              :     }
    1548              :   }
    1549            4 :   out.copyfmt(init);
    1550              : }
    1551              : 
    1552       112088 : void FloatTensor::copy(const void *buf) {
    1553       112088 :   NNTR_THROW_IF(!contiguous, std::invalid_argument)
    1554              :     << getName() << " is not contiguous, cannot copy.";
    1555              : 
    1556       112088 :   if (buf == getData()) {
    1557              :     return;
    1558              :   }
    1559              : 
    1560       111785 :   scopy(size(), (float *)buf, 1, (float *)getData(), 1);
    1561              : }
    1562              : 
/**
 * @brief Recursive worker for apply_broadcast.
 *
 * Walks the tensor axes outermost-first, accumulating element offsets into
 * this tensor/output (`offset`) and into the broadcast operand (`m_offset`,
 * which uses e.strides so broadcast axes advance by 0). Once the recursion
 * reaches e.buffer_axis, the remaining inner region is handed to @p v_func
 * as contiguous sub-buffers.
 *
 * @param m broadcast operand.
 * @param v_func kernel applied to (info, lhs, rhs, out) sub-buffers.
 * @param output destination tensor (same shape as this).
 * @param e precomputed broadcast info (strides, buffer_axis, buffer_size).
 * @param cur_axis axis currently being expanded (-1 on first call;
 *        incremented before each level of recursion).
 * @param offset element offset into this tensor / output so far.
 * @param m_offset element offset into @p m so far.
 */
void FloatTensor::apply_broadcast_util(
  Tensor const &m,
  std::function<void(const BroadcastInfo &e, const float *, const float *,
                     float *)>
    v_func,
  Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
  size_t m_offset) const {

  const float *buf = (float *)this->getData();
  const float *m_buf = m.getData<float>();
  float *out_buf = output.getData<float>();

  // Base case: inner region from here on is contiguous — run the kernel.
  if (e.buffer_axis == cur_axis) {
    v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
    return;
  }

  cur_axis++;
  // Map logical axis order to storage order; NHWC stores channel last.
  unsigned int continuity[4] = {0, 1, 2, 3};
  if (getFormat() == Tformat::NHWC) {
    continuity[1] = 2;
    continuity[2] = 3;
    continuity[3] = 1;
  }
  // Recurse over each index of the current axis. Note m advances by
  // e.strides (0 on broadcast axes), while this/output advance by our own
  // strides.
  for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) {
    size_t next_offset = offset + i * strides[cur_axis];
    size_t next_m_offset = m_offset + i * e.strides[cur_axis];
    apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
                         next_m_offset);
  }
}
    1594              : 
/**
 * @brief Apply an element-wise kernel between this tensor and @p m, with
 *        @p m broadcast to this tensor's shape, writing into @p output.
 *
 * @param m operand tensor (same shape, or broadcastable per
 *        computeBroadcastInfo).
 * @param v_func kernel applied to (info, lhs, rhs, out) buffers.
 * @param output destination; created with this tensor's dim if empty.
 * @throws std::invalid_argument if any participating tensor is unallocated
 *         (or, from computeBroadcastInfo, if shapes are not broadcastable).
 */
void FloatTensor::apply_broadcast(
  Tensor const &m,
  std::function<void(const BroadcastInfo &e, const float *, const float *,
                     float *)>
    v_func,
  Tensor &output) const {
  CREATE_IF_EMPTY_DIMS(output, dim);

  NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
    << getName() << " is not allocated";
  NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
    << m.getName() << " is not allocated";
  NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
    << output.getName() << " is not allocated";

  /// shortcut to cover when dimension matches
  /// note that buffer_size, the last stride is only used in v_func but it
  /// might be changed
  if (dim == m.getDim()) {
    // Same shape: one kernel call over the whole flat buffer, no recursion.
    BroadcastInfo e;
    e.buffer_size = size();
    e.strides[3] = 1;
    e.tensor_type = getTensorType();
    v_func(e, (float *)getData(), m.getData<float>(), output.getData<float>());
    return;
  }

  // General case: recursive axis walk with per-axis broadcast strides.
  return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
}
    1624              : 
    1625           12 : bool FloatTensor::isValid() const {
    1626           12 :   return is_valid(dim.getDataLen(), (float *)getData());
    1627              : }
    1628              : 
    1629              : } // namespace nntrainer
        

Generated by: LCOV version 2.0-1