LCOV - code coverage report
Current view: top level - nntrainer/tensor - tensor.cpp (source / functions) Coverage Total Hit
Test: coverage_filtered.info Lines: 73.8 % 856 632
Test Date: 2025-12-14 20:38:17 Functions: 87.0 % 169 147

            Line data    Source code
       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * @file        tensor.cpp
       4              :  * @date        01 December 2023
       5              :  * @brief       This is a Tensor class
       6              :  * @see         https://github.com/nnstreamer/nntrainer
       7              :  * @author      Jijoong Moon <jijoong.moon@samsung.com>
       8              :  * @author      Donghyeon Jeong <dhyeon.jeong@samsung.com>
       9              :  * @bug         No known bugs except for NYI items
      10              :  */
      11              : 
      12              : #include <numeric>
      13              : 
      14              : #include <char_tensor.h>
      15              : #include <float_tensor.h>
      16              : #include <int4_tensor.h>
      17              : #include <lazy_tensor.h>
      18              : #include <q4_0_tensor.h>
      19              : #include <q4_k_tensor.h>
      20              : #include <q6_k_tensor.h>
      21              : #include <short_tensor.h>
      22              : #include <tensor.h>
      23              : #include <uint4_tensor.h>
      24              : #include <uint_tensor.h>
      25              : 
      26              : #ifdef ENABLE_FP16
      27              : #include <half_tensor.h>
      28              : #endif
      29              : 
      30              : #ifdef ENABLE_BIQGEMM
      31              : #include <bcq_tensor.h>
      32              : #endif
      33              : 
      34              : #include <fcntl.h>
      35              : 
      36              : #if defined(__unix__) || defined(__ANDROID__) || defined(__arm__)
      37              : #include <sys/mman.h>
      38              : #include <sys/stat.h>
      39              : #include <unistd.h>
      40              : #endif
      41              : 
      42              : namespace nntrainer {
      43              : 
      44            1 : Tensor::Tensor(
      45              :   std::vector<std::vector<std::vector<std::vector<int16_t>>>> const &d,
      46              :   std::vector<float> const &scales, ml::train::TensorDim::TensorType t_type,
      47            1 :   QScheme qscheme_) {
      48              :   switch (qscheme_) {
      49              :   case QScheme::PER_TENSOR_AFFINE:
      50              :     break;
      51              :   case QScheme::PER_CHANNEL_AFFINE:
      52              :     break;
      53              :   default:
      54              :     break;
      55              :   }
      56            1 :   itensor_ = std::make_unique<ShortTensor>(d, scales, t_type.format, qscheme_);
      57            1 : }
      58              : 
      59            7 : Tensor::Tensor(
      60              :   std::vector<std::vector<std::vector<std::vector<int8_t>>>> const &d,
      61              :   std::vector<float> const &scales, ml::train::TensorDim::TensorType t_type,
      62            7 :   QScheme qscheme_) {
      63            7 :   if (t_type.data_type == Tdatatype::QINT4) {
      64              :     itensor_ =
      65            9 :       std::make_unique<Int4QTensor>(d, scales, t_type.format, qscheme_);
      66            2 :   } else if (t_type.data_type == Tdatatype::QINT8) {
      67            4 :     itensor_ = std::make_unique<CharTensor>(d, scales, t_type.format, qscheme_);
      68              :   } else {
      69              :     throw std::invalid_argument(
      70              :       "Error: Tensor cannot be constructed because the given data type is "
      71            0 :       "incorrect. The supported d_types are: QINT4, QINT8");
      72              :   }
      73            5 : }
      74              : 
      75           31 : Tensor::Tensor(
      76              :   std::vector<std::vector<std::vector<std::vector<float>>>> const &d,
      77           31 :   ml::train::TensorDim::TensorType t_type) {
      78           31 :   itensor_ = std::make_unique<FloatTensor>(d, t_type.format);
      79           30 : }
      80              : 
      81            1 : Tensor::Tensor(
      82              :   std::vector<std::vector<std::vector<std::vector<uint8_t>>>> const &d,
      83              :   std::vector<float> const &scales,
      84              :   std::vector<unsigned int> const &zero_points,
      85            1 :   ml::train::TensorDim::TensorType t_type, QScheme qscheme_) {
      86            1 :   if (t_type.data_type == Tdatatype::UINT4) {
      87            0 :     itensor_ = std::make_unique<Uint4QTensor>(d, scales, zero_points,
      88              :                                               t_type.format, qscheme_);
      89            1 :   } else if (t_type.data_type == Tdatatype::UINT8) {
      90            2 :     itensor_ = std::make_unique<UInt8Tensor>(d, scales, zero_points,
      91              :                                              t_type.format, qscheme_);
      92              :   } else {
      93              :     throw std::invalid_argument(
      94              :       "Error: Tensor cannot be constructed because the given data type is "
      95            0 :       "incorrect. The supported d_types are: UINT4, UINT8");
      96              :   }
      97            1 : }
      98              : 
      99            5 : Tensor::Tensor(
     100              :   std::vector<std::vector<std::vector<std::vector<uint16_t>>>> const &d,
     101              :   std::vector<float> const &scales,
     102              :   std::vector<unsigned int> const &zero_points,
     103            5 :   ml::train::TensorDim::TensorType t_type, QScheme qscheme_) {
     104            5 :   itensor_ = std::make_unique<UInt16Tensor>(d, scales, zero_points,
     105              :                                             t_type.format, qscheme_);
     106            2 : }
     107              : 
     108            1 : Tensor::Tensor(
     109              :   std::vector<std::vector<std::vector<std::vector<uint32_t>>>> const &d,
     110              :   std::vector<float> const &scales,
     111              :   std::vector<unsigned int> const &zero_points,
     112            1 :   ml::train::TensorDim::TensorType t_type, QScheme qscheme_) {
     113            1 :   itensor_ = std::make_unique<UInt32Tensor>(d, scales, zero_points,
     114              :                                             t_type.format, qscheme_);
     115            1 : }
     116              : 
/**
 * @brief Construct an empty Tensor with a name, format and data type.
 *
 * Dispatches on @p d_type to create the matching TensorBase implementation.
 *
 * @param name_ tensor name forwarded to the implementation
 * @param fm tensor format forwarded to the implementation
 * @param d_type data type selecting the concrete implementation
 * @throws std::invalid_argument when the type requires a disabled build
 * option (FP16 / BCQ) or is not supported at all
 */
Tensor::Tensor(std::string name_, Tformat fm, Tdatatype d_type) {
  itensor_ = nullptr;

  if (d_type == Tdatatype::FP32) {
    itensor_ = std::make_unique<FloatTensor>(name_, fm);
  } else if (d_type == Tdatatype::FP16) {
#ifdef ENABLE_FP16
    itensor_ = std::make_unique<HalfTensor>(name_, fm);
#else
    // FP16 support is compile-time gated
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  } else if (d_type == Tdatatype::Q4_K) {
    itensor_ = std::make_unique<Q4_K_Tensor>(name_, fm);
  } else if (d_type == Tdatatype::Q6_K) {
    itensor_ = std::make_unique<Q6_K_Tensor>(name_, fm);
  } else if (d_type == Tdatatype::Q4_0) {
    itensor_ = std::make_unique<Q4_0_Tensor>(name_, fm);
  } else if (d_type == Tdatatype::UINT4) {
    itensor_ = std::make_unique<Uint4QTensor>(name_, fm);
  } else if (d_type == Tdatatype::UINT8) {
    itensor_ = std::make_unique<UInt8Tensor>(name_, fm);
  } else if (d_type == Tdatatype::UINT16) {
    itensor_ = std::make_unique<UInt16Tensor>(name_, fm);
  } else if (d_type == Tdatatype::UINT32) {
    itensor_ = std::make_unique<UInt32Tensor>(name_, fm);
  } else if (d_type == Tdatatype::QINT16) {
    itensor_ = std::make_unique<ShortTensor>(name_, fm);
  } else if (d_type == Tdatatype::QINT8) {
    itensor_ = std::make_unique<CharTensor>(name_, fm);
  } else if (d_type == Tdatatype::QINT4) {
    itensor_ = std::make_unique<Int4QTensor>(name_, fm);
  } else if (d_type == Tdatatype::BCQ) {
#ifdef ENABLE_BIQGEMM
    itensor_ = std::make_unique<BCQTensor>(name_, fm);
#else
    // BCQ support is compile-time gated
    throw std::invalid_argument("Error: enable-biqgemm is not activated. "
                                "Enable only if your system supports BiQGEMM.");
#endif
  } else {
    throw std::invalid_argument(
      "Error: Tensor cannot be constructed because the given d_type is not "
      "compatible with itensor. The supported d_types are: FP32, FP16 "
      "(if built with ENABLE_FP16).");
  }
}
     162              : 
/**
 * @brief Construct a Tensor from a TensorDim, dispatching on the dim's data
 * type to create the matching TensorBase implementation.
 *
 * @param d tensor dimension (carries the data type)
 * @param alloc_now whether to allocate memory immediately
 * @param init initializer forwarded to the implementation
 * @param name tensor name
 * @param qscheme quantization scheme (used by UINT4/QINT* branches)
 * @param is_virtual stored on this wrapper as-is
 * @throws std::invalid_argument when the type requires a disabled build
 * option (FP16 / BCQ) or is not supported at all
 */
Tensor::Tensor(const TensorDim &d, bool alloc_now, Initializer init,
               std::string name, QScheme qscheme, bool is_virtual) {
  itensor_ = nullptr;
  this->is_virtual = is_virtual;

  if (d.getDataType() == Tdatatype::FP32) {
    itensor_ = std::make_unique<FloatTensor>(d, alloc_now, init, name);
  } else if (d.getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
    itensor_ = std::make_unique<HalfTensor>(d, alloc_now, init, name);
#else
    // FP16 support is compile-time gated
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  } else if (d.getDataType() == Tdatatype::Q4_K) {
    itensor_ = std::make_unique<Q4_K_Tensor>(d, alloc_now, init, name);
  } else if (d.getDataType() == Tdatatype::Q6_K) {
    itensor_ = std::make_unique<Q6_K_Tensor>(d, alloc_now, init, name);
  } else if (d.getDataType() == Tdatatype::Q4_0) {
    itensor_ = std::make_unique<Q4_0_Tensor>(d, alloc_now, init, name);
  } else if (d.getDataType() == Tdatatype::UINT4) {
    // UINT4 with the Q4_Kx8 scheme is backed by a Q4_K tensor instead of
    // the plain 4-bit unsigned tensor
    if (qscheme != QScheme::Q4_Kx8) {
      itensor_ =
        std::make_unique<Uint4QTensor>(d, alloc_now, init, name, qscheme);
    } else {
      itensor_ =
        std::make_unique<Q4_K_Tensor>(d, alloc_now, init, name, qscheme);
    }
  } else if (d.getDataType() == Tdatatype::UINT8) {
    itensor_ = std::make_unique<UInt8Tensor>(d, alloc_now, init, name);
  } else if (d.getDataType() == Tdatatype::UINT16) {
    itensor_ = std::make_unique<UInt16Tensor>(d, alloc_now, init, name);
  } else if (d.getDataType() == Tdatatype::UINT32) {
    itensor_ = std::make_unique<UInt32Tensor>(d, alloc_now, init, name);
  } else if (d.getDataType() == Tdatatype::QINT16) {
    itensor_ = std::make_unique<ShortTensor>(d, alloc_now, init, name, qscheme);
  } else if (d.getDataType() == Tdatatype::QINT8) {
    itensor_ = std::make_unique<CharTensor>(d, alloc_now, init, name, qscheme);
  } else if (d.getDataType() == Tdatatype::QINT4) {
    itensor_ = std::make_unique<Int4QTensor>(d, alloc_now, init, name, qscheme);
  } else if (d.getDataType() == Tdatatype::BCQ) {
#ifdef ENABLE_BIQGEMM
    itensor_ = std::make_unique<BCQTensor>(d, alloc_now, init, name);
#else
    // BCQ support is compile-time gated
    throw std::invalid_argument("Error: enable-biqgemm is not activated. "
                                "Enable only if your system supports BiQGEMM.");
#endif
  } else {
    throw std::invalid_argument(
      "Error: Tensor cannot be constructed because the given d_type is not "
      "compatible with itensor. The supported d_types are: FP32, FP16 "
      "(if built with ENABLE_FP16).");
  }
}
     216              : 
/**
 * @brief Construct a Tensor from a TensorDim and an optional raw buffer,
 * dispatching on the dim's data type.
 *
 * @param d tensor dimension (carries the data type)
 * @param buf raw buffer forwarded to the implementation
 * @param qscheme quantization scheme (used by UINT4/QINT16/QINT8 branches)
 * @throws std::invalid_argument when the type requires a disabled build
 * option (FP16 / BCQ) or is not supported at all
 */
Tensor::Tensor(const TensorDim &d, const void *buf, QScheme qscheme) {
  itensor_ = nullptr;

  if (d.getDataType() == Tdatatype::FP32) {
    itensor_ = std::make_unique<FloatTensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
    itensor_ = std::make_unique<HalfTensor>(d, buf);
#else
    // FP16 support is compile-time gated
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  } else if (d.getDataType() == Tdatatype::Q4_K) {
    itensor_ = std::make_unique<Q4_K_Tensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::Q6_K) {
    itensor_ = std::make_unique<Q6_K_Tensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::Q4_0) {
    itensor_ = std::make_unique<Q4_0_Tensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::UINT4) {
    // UINT4 with the Q4_Kx8 scheme is backed by a Q4_K tensor
    if (qscheme != QScheme::Q4_Kx8)
      itensor_ = std::make_unique<Uint4QTensor>(d, buf, qscheme);
    else
      itensor_ = std::make_unique<Q4_K_Tensor>(d, buf, qscheme);
  } else if (d.getDataType() == Tdatatype::UINT8) {
    itensor_ = std::make_unique<UInt8Tensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::UINT16) {
    itensor_ = std::make_unique<UInt16Tensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::UINT32) {
    itensor_ = std::make_unique<UInt32Tensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::QINT16) {
    itensor_ = std::make_unique<ShortTensor>(d, buf, qscheme);
  } else if (d.getDataType() == Tdatatype::QINT8) {
    itensor_ = std::make_unique<CharTensor>(d, buf, qscheme);
  } else if (d.getDataType() == Tdatatype::QINT4) {
    // NOTE(review): qscheme is not forwarded here, unlike the QINT16/QINT8
    // branches above — confirm this is intentional
    itensor_ = std::make_unique<Int4QTensor>(d, buf);
  } else if (d.getDataType() == Tdatatype::BCQ) {
#ifdef ENABLE_BIQGEMM
    itensor_ = std::make_unique<BCQTensor>(d, buf);
#else
    // BCQ support is compile-time gated
    throw std::invalid_argument("Error: enable-biqgemm is not activated. "
                                "Enable only if your system supports BiQGEMM.");
#endif
  } else {
    throw std::invalid_argument(
      "Error: Tensor cannot be constructed because the given d_type is not "
      "compatible with itensor. The supported d_types are: FP32, FP16 "
      "(if built with ENABLE_FP16).");
  }
}
     265              : 
     266       533568 : Tensor::Tensor(const Tensor &rhs) {
     267       533568 :   if (rhs.getDataType() == Tdatatype::FP32) {
     268      1067108 :     itensor_ = std::make_unique<FloatTensor>(*rhs.itensor_);
     269           14 :   } else if (rhs.getDataType() == Tdatatype::FP16) {
     270              : #ifdef ENABLE_FP16
     271              :     itensor_ = std::make_unique<HalfTensor>(*rhs.itensor_);
     272              : #else
     273            0 :     throw std::invalid_argument("Error: enable-fp16 is not enabled");
     274              : #endif
     275           14 :   } else if (rhs.getDataType() == Tdatatype::Q4_K) {
     276            0 :     itensor_ = std::make_unique<Q4_K_Tensor>(*rhs.itensor_);
     277           14 :   } else if (rhs.getDataType() == Tdatatype::Q6_K) {
     278            0 :     itensor_ = std::make_unique<Q6_K_Tensor>(*rhs.itensor_);
     279           14 :   } else if (rhs.getDataType() == Tdatatype::Q4_0) {
     280            0 :     itensor_ = std::make_unique<Q4_0_Tensor>(*rhs.itensor_);
     281           14 :   } else if (rhs.getDataType() == Tdatatype::UINT4) {
     282            0 :     itensor_ = std::make_unique<Uint4QTensor>(*rhs.itensor_);
     283           14 :   } else if (rhs.getDataType() == Tdatatype::UINT8) {
     284            2 :     itensor_ = std::make_unique<UInt8Tensor>(*rhs.itensor_);
     285           13 :   } else if (rhs.getDataType() == Tdatatype::UINT16) {
     286            6 :     itensor_ = std::make_unique<UInt16Tensor>(*rhs.itensor_);
     287           10 :   } else if (rhs.getDataType() == Tdatatype::UINT32) {
     288           14 :     itensor_ = std::make_unique<UInt32Tensor>(*rhs.itensor_);
     289            3 :   } else if (rhs.getDataType() == Tdatatype::QINT16) {
     290            0 :     itensor_ = std::make_unique<ShortTensor>(*rhs.itensor_);
     291            3 :   } else if (rhs.getDataType() == Tdatatype::QINT8) {
     292            6 :     itensor_ = std::make_unique<CharTensor>(*rhs.itensor_);
     293            0 :   } else if (rhs.getDataType() == Tdatatype::QINT4) {
     294            0 :     itensor_ = std::make_unique<Int4QTensor>(*rhs.itensor_);
     295            0 :   } else if (rhs.getDataType() == Tdatatype::BCQ) {
     296              : #ifdef ENABLE_BIQGEMM
     297              :     itensor_ = std::make_unique<BCQTensor>(*rhs.itensor_);
     298              : #else
     299              :     throw std::invalid_argument("Error: enable-biqgemm is not activated. "
     300            0 :                                 "Enable only if your system supports BiQGEMM.");
     301              : #endif
     302              :   }
     303              : 
     304              :   /** copy tensor properties */
     305       533568 :   this->is_virtual = rhs.is_virtual;
     306       533568 :   this->fd = rhs.fd;
     307       533568 :   this->read_offset = rhs.read_offset;
     308       533568 :   this->mapped_ptr = rhs.mapped_ptr;
     309       533568 : }
     310              : 
     311            2 : Tensor::Tensor(const std::unique_ptr<TensorBase> &rhs) {
     312            3 :   NNTR_THROW_IF(rhs.get() == nullptr, std::invalid_argument)
     313              :     << "Error: received a nullptr. Tensor cannot be constructed";
     314              : 
     315              :   if (rhs->getDataType() == Tdatatype::FP32) {
     316            2 :     itensor_ = std::make_unique<FloatTensor>(*rhs.get());
     317              :   } else if (rhs->getDataType() == Tdatatype::FP16) {
     318              : #ifdef ENABLE_FP16
     319              :     itensor_ = std::make_unique<HalfTensor>(*rhs.get());
     320              : #else
     321            0 :     throw std::invalid_argument("Error: enable-fp16 is not enabled");
     322              : #endif
     323              :   } else if (rhs->getDataType() == Tdatatype::UINT4) {
     324            0 :     itensor_ = std::make_unique<Uint4QTensor>(*rhs.get());
     325              :   } else if (rhs->getDataType() == Tdatatype::UINT8) {
     326            0 :     itensor_ = std::make_unique<UInt8Tensor>(*rhs.get());
     327              :   } else if (rhs->getDataType() == Tdatatype::UINT16) {
     328            0 :     itensor_ = std::make_unique<UInt16Tensor>(*rhs.get());
     329              :   } else if (rhs->getDataType() == Tdatatype::UINT32) {
     330            0 :     itensor_ = std::make_unique<UInt32Tensor>(*rhs.get());
     331              :   } else if (rhs->getDataType() == Tdatatype::QINT16) {
     332            0 :     itensor_ = std::make_unique<ShortTensor>(*rhs.get());
     333              :   } else if (rhs->getDataType() == Tdatatype::QINT8) {
     334            0 :     itensor_ = std::make_unique<CharTensor>(*rhs.get());
     335              :   } else if (rhs->getDataType() == Tdatatype::QINT4) {
     336            1 :     itensor_ = std::make_unique<Int4QTensor>(*rhs.get());
     337              :   } else if (rhs->getDataType() == Tdatatype::BCQ) {
     338              : #ifdef ENABLE_BIQGEMM
     339              :     itensor_ = std::make_unique<BCQTensor>(*rhs.get());
     340              : #else
     341              :     throw std::invalid_argument("Error: enable-biqgemm is not activated. "
     342            0 :                                 "Enable only if your system supports BiQGEMM.");
     343              : #endif
     344              :   }
     345            1 : }
     346              : 
     347         6741 : Tensor &Tensor::operator=(const Tensor &rhs) {
     348         6741 :   if (rhs.getDataType() == Tdatatype::FP32) {
     349        13482 :     itensor_ = std::make_unique<FloatTensor>(*rhs.itensor_);
     350            0 :   } else if (rhs.getDataType() == Tdatatype::FP16) {
     351              : #ifdef ENABLE_FP16
     352              :     itensor_ = std::make_unique<HalfTensor>(*rhs.itensor_);
     353              : #else
     354            0 :     throw std::invalid_argument("Error: enable-fp16 is not enabled");
     355              : #endif
     356            0 :   } else if (rhs.getDataType() == Tdatatype::Q4_K) {
     357            0 :     itensor_ = std::make_unique<Q4_K_Tensor>(*rhs.itensor_);
     358            0 :   } else if (rhs.getDataType() == Tdatatype::Q6_K) {
     359            0 :     itensor_ = std::make_unique<Q6_K_Tensor>(*rhs.itensor_);
     360            0 :   } else if (rhs.getDataType() == Tdatatype::Q4_0) {
     361            0 :     itensor_ = std::make_unique<Q4_0_Tensor>(*rhs.itensor_);
     362            0 :   } else if (rhs.getDataType() == Tdatatype::UINT4) {
     363            0 :     itensor_ = std::make_unique<Uint4QTensor>(*rhs.itensor_);
     364            0 :   } else if (rhs.getDataType() == Tdatatype::UINT8) {
     365            0 :     itensor_ = std::make_unique<UInt8Tensor>(*rhs.itensor_);
     366            0 :   } else if (rhs.getDataType() == Tdatatype::UINT16) {
     367            0 :     itensor_ = std::make_unique<UInt16Tensor>(*rhs.itensor_);
     368            0 :   } else if (rhs.getDataType() == Tdatatype::UINT32) {
     369            0 :     itensor_ = std::make_unique<UInt32Tensor>(*rhs.itensor_);
     370            0 :   } else if (rhs.getDataType() == Tdatatype::QINT16) {
     371            0 :     itensor_ = std::make_unique<ShortTensor>(*rhs.itensor_);
     372            0 :   } else if (rhs.getDataType() == Tdatatype::QINT8) {
     373            0 :     itensor_ = std::make_unique<CharTensor>(*rhs.itensor_);
     374            0 :   } else if (rhs.getDataType() == Tdatatype::QINT4) {
     375            0 :     itensor_ = std::make_unique<Int4QTensor>(*rhs.itensor_);
     376            0 :   } else if (rhs.getDataType() == Tdatatype::BCQ) {
     377              : #ifdef ENABLE_BIQGEMM
     378              :     itensor_ = std::make_unique<BCQTensor>(*rhs.itensor_);
     379              : #else
     380              :     throw std::invalid_argument("Error: enable-biqgemm is not activated. "
     381            0 :                                 "Enable only if your system supports BiQGEMM.");
     382              : #endif
     383              :   }
     384              : 
     385              :   /** copy tensor properties */
     386         6741 :   this->is_virtual = rhs.is_virtual;
     387         6741 :   this->fd = rhs.fd;
     388         6741 :   this->read_offset = rhs.read_offset;
     389         6741 :   this->mapped_ptr = rhs.mapped_ptr;
     390         6741 :   return *this;
     391              : }
     392              : 
/**
 * @brief Equality comparison: first compares tensor information via the
 * TensorBase operator==, then compares the actual data with the
 * implementation-specific itensorCompare.
 *
 * @param rhs tensor to compare against
 * @return true when both the tensor information and the data match
 * @throws std::invalid_argument for FP16/BCQ types when the corresponding
 * build option is disabled
 */
bool Tensor::operator==(const Tensor &rhs) const {
  /// compares tensor information
  if (*itensor_.get() == *rhs.itensor_.get()) {
    /// compares tensor data
    if (getDataType() == Tdatatype::FP32) {
      return itensorCompare<FloatTensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
      return itensorCompare<HalfTensor>(itensor_.get(), rhs.itensor_.get());
#else
      throw std::invalid_argument(
        "Error: HalfTensor cannot be created or used when FP16 is not enabled. "
        "Please check if the tensor data type is set properly.");
#endif
    } else if (getDataType() == Tdatatype::Q4_K) {
      return itensorCompare<Q4_K_Tensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::Q6_K) {
      return itensorCompare<Q6_K_Tensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::Q4_0) {
      return itensorCompare<Q4_0_Tensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::UINT4) {
      return itensorCompare<Uint4QTensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::UINT8) {
      return itensorCompare<UInt8Tensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::UINT16) {
      return itensorCompare<UInt16Tensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::UINT32) {
      return itensorCompare<UInt32Tensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::QINT16) {
      return itensorCompare<ShortTensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::QINT8) {
      return itensorCompare<CharTensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::QINT4) {
      return itensorCompare<Int4QTensor>(itensor_.get(), rhs.itensor_.get());
    } else if (getDataType() == Tdatatype::BCQ) {
#ifdef ENABLE_BIQGEMM
      return itensorCompare<BCQTensor>(itensor_.get(), rhs.itensor_.get());
#else
      throw std::invalid_argument(
        "Error: enable-biqgemm is not activated. "
        "Enable only if your system supports BiQGEMM.");
#endif
    }
  }
  // reached when the tensor information differs, or when no data-type branch
  // above matched
  return false;
}
     439              : 
/** @brief Allocate memory via the underlying tensor implementation. */
void Tensor::allocate() { itensor_->allocate(); }
     441              : 
/** @brief Release memory via the underlying tensor implementation. */
void Tensor::deallocate() { itensor_->deallocate(); }
     443              : 
/** @brief Check whether the underlying tensor's memory is allocated. */
bool Tensor::isAllocated() { return itensor_->isAllocated(); }
     445              : 
/** @brief Set every element to @p value (delegated to the implementation). */
void Tensor::setValue(float value) { itensor_->setValue(value); }
     447              : 
/** @brief Set the element at index (b, c, h, w) to @p value. */
void Tensor::setValue(unsigned int b, unsigned int c, unsigned int h,
                      unsigned int w, float value) {
  itensor_->setValue(b, c, h, w, value);
}
     452              : 
/**
 * @brief Accumulate @p value into the element at (b, c, h, w); @p beta is
 * forwarded to the implementation (presumably scales the existing element —
 * confirm against the TensorBase contract).
 */
void Tensor::addValue(unsigned int b, unsigned int c, unsigned int h,
                      unsigned int w, float value, float beta) noexcept {
  itensor_->addValue(b, c, h, w, value, beta);
}
     457              : 
/** @brief Zero-fill the tensor (delegated to the implementation). */
void Tensor::setZero() { itensor_->setZero(); }
     459              : 
/** @brief Fill with normally distributed random values (mean, stddev). */
void Tensor::setRandNormal(float mean, float stddev) {
  itensor_->setRandNormal(mean, stddev);
}
     463              : 
/** @brief Fill with uniformly distributed random values in [min, max]. */
void Tensor::setRandUniform(float min, float max) {
  itensor_->setRandUniform(min, max);
}
     467              : 
/** @brief Fill with Bernoulli-distributed random values. */
void Tensor::setRandBernoulli(float probability) {
  itensor_->setRandBernoulli(probability);
}
     471              : 
/** @brief Initialize values using the implementation's current initializer. */
void Tensor::initialize() { itensor_->initialize(); }
     473              : 
/** @brief Initialize values using the given initializer. */
void Tensor::initialize(Initializer init) { itensor_->initialize(init); }
     475              : 
/** @brief Apply @p f to this tensor and return f's result. */
Tensor Tensor::apply(std::function<Tensor(Tensor)> f) const { return f(*this); }
     477              : 
/**
 * @brief Apply @p f to this tensor with @p output as the destination.
 * @return the reference returned by @p f (by convention, @p output)
 */
Tensor &Tensor::apply(std::function<Tensor &(Tensor, Tensor &)> f,
                      Tensor &output) const {
  return f(*this, output);
}
     482              : 
     483         2846 : int Tensor::multiply_i_strided(Tensor const &m, const float beta) {
     484              :   try {
     485         2846 :     this->multiply_strided(m, *this, beta);
     486            2 :   } catch (std::exception &err) {
     487            2 :     ml_loge("%s %s", typeid(err).name(), err.what());
     488              :     return ML_ERROR_INVALID_PARAMETER;
     489            2 :   }
     490              : 
     491              :   return ML_ERROR_NONE;
     492              : }
     493              : 
     494           31 : Tensor Tensor::multiply_strided(Tensor const &m, const float beta) const {
     495           31 :   Tensor t("", getFormat(), getDataType());
     496           59 :   return this->multiply_strided(m, t, beta);
     497           31 : }
     498              : 
     499        12608 : Tensor &Tensor::multiply_strided(Tensor const &m, Tensor &output,
     500              :                                  const float beta) const {
     501        12608 :   itensor_->multiply_strided(m, output, beta);
     502        12602 :   return output;
     503              : }
     504              : 
/**
 * @brief In-place multiply every element by @p value.
 * @return error code from the underlying implementation
 * @throws std::invalid_argument when the tensor is not contiguous
 */
int Tensor::multiply_i(float const &value) {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot multiply";

  return itensor_->multiply_i(value);
}
     511              : 
     512         6633 : Tensor Tensor::multiply(float const &value) const {
     513         6633 :   Tensor t("", getFormat(), getDataType());
     514        13266 :   return multiply(value, t);
     515         6633 : }
     516              : 
     517         7691 : Tensor &Tensor::multiply(float const &value, Tensor &out) const {
     518         7691 :   itensor_->multiply(value, out);
     519         7691 :   return out;
     520              : }
     521              : 
     522         8602 : int Tensor::multiply_i(Tensor const &m, const float beta) {
     523              :   try {
     524         8602 :     this->multiply(m, *this, beta);
     525          135 :   } catch (std::exception &err) {
     526          135 :     ml_loge("%s %s", typeid(err).name(), err.what());
     527              :     return ML_ERROR_INVALID_PARAMETER;
     528          135 :   }
     529              : 
     530              :   return ML_ERROR_NONE;
     531              : }
     532              : 
     533         3179 : Tensor Tensor::multiply(Tensor const &m, const float beta) const {
     534         3179 :   Tensor t("", getFormat(), getDataType());
     535         6353 :   return multiply(m, t, beta);
     536         3179 : }
     537              : 
/**
 * @brief Elementwise multiply with @p m into @p output.
 * @throws std::invalid_argument when formats mismatch or any operand is
 *         non-contiguous
 * @return reference to @p output
 */
Tensor &Tensor::multiply(Tensor const &m, Tensor &output,
                         const float beta) const {
  NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
    << "Tensor Format of " << getName() << ":"
    << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
    << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";

  NNTR_THROW_IF(!getContiguous() || !m.getContiguous() ||
                  !output.getContiguous(),
                std::invalid_argument)
    << getName() << " is not contiguous, cannot multiply";

  itensor_->multiply(m, output, beta);
  return output;
}
     553              : 
     554         6237 : int Tensor::divide_i(float const &value) {
     555         6237 :   if (value == 0.0f) {
     556              :     return ML_ERROR_INVALID_PARAMETER;
     557              :   }
     558         6236 :   this->divide(value, *this);
     559         6236 :   return ML_ERROR_NONE;
     560              : }
     561              : 
     562            3 : Tensor Tensor::divide(float const &value) const {
     563            3 :   Tensor output("", getFormat(), getDataType());
     564            5 :   return divide(value, output);
     565            3 : }
     566              : 
     567         6274 : Tensor &Tensor::divide(float const &value, Tensor &output) const {
     568              :   /// @todo add unittest, ZeroDivisionError
     569         6274 :   if (value == 0.0f) {
     570            1 :     std::stringstream ss;
     571            1 :     ss << "[Tensor] divide by value failed, value: " << value;
     572            3 :     throw std::invalid_argument(ss.str().c_str());
     573            1 :   }
     574         6273 :   itensor_->divide(value, output);
     575         6273 :   return output;
     576              : }
     577              : 
     578          182 : int Tensor::divide_i(Tensor const &m) {
     579              :   try {
     580          182 :     this->divide(m, *this);
     581            5 :   } catch (std::exception &err) {
     582            5 :     ml_loge("%s %s", typeid(err).name(), err.what());
     583              :     return ML_ERROR_INVALID_PARAMETER;
     584            5 :   }
     585              : 
     586              :   return ML_ERROR_NONE;
     587              : }
     588              : 
     589           11 : Tensor Tensor::divide(Tensor const &m) const {
     590           11 :   Tensor output("", getFormat(), getDataType());
     591           18 :   return this->divide(m, output);
     592           11 : }
     593              : 
/**
 * @brief Elementwise divide by @p m into @p output.
 * @throws std::invalid_argument when any operand is non-contiguous
 * @return reference to @p output
 */
Tensor &Tensor::divide(Tensor const &m, Tensor &output) const {
  NNTR_THROW_IF(!getContiguous() || !m.getContiguous() ||
                  !output.getContiguous(),
                std::invalid_argument)
    << getName() << " is not contiguous, cannot divide";
  itensor_->divide(m, output);
  return output;
}
     602              : 
     603          207 : int Tensor::add_i_strided(Tensor const &input, const float beta) {
     604              :   try {
     605          207 :     this->add_strided(input, *this, beta);
     606            0 :   } catch (std::exception &err) {
     607            0 :     ml_loge("%s %s", typeid(err).name(), err.what());
     608              :     return ML_ERROR_INVALID_PARAMETER;
     609            0 :   }
     610              : 
     611              :   return ML_ERROR_NONE;
     612              : }
     613              : 
     614            4 : Tensor Tensor::add_strided(Tensor const &input, const float beta) const {
     615            4 :   Tensor output("", getFormat(), getDataType());
     616            6 :   return this->add_strided(input, output, beta);
     617            4 : }
     618              : 
/**
 * @brief Strided add into @p output; broadcasting is not supported.
 * @throws std::invalid_argument when sizes differ
 * @return reference to @p output
 */
Tensor &Tensor::add_strided(Tensor const &input, Tensor &output,
                            const float beta) const {
  // Allocate output with this tensor's dimensions when it is still empty.
  CREATE_IF_EMPTY_DIMS(output, getDim(), nullptr);

  if (size() != input.size() || size() != output.size())
    throw std::invalid_argument(
      "Strided addition does not support broadcasting");

  itensor_->add_strided(input, output, beta);

  return output;
}
     631              : 
     632         1231 : int Tensor::add_i(float const &value) {
     633         1231 :   this->add(value, *this);
     634         1231 :   return ML_ERROR_NONE;
     635              : }
     636              : 
     637         6179 : Tensor Tensor::add(float const &value) const {
     638         6179 :   Tensor t("", getFormat(), getDataType());
     639        12358 :   return add(value, t);
     640         6179 : }
     641              : 
     642         7432 : Tensor &Tensor::add(float const &value, Tensor &output) const {
     643         7432 :   itensor_->add(value, output);
     644         7432 :   return output;
     645              : }
     646              : 
     647        46231 : int Tensor::add_i(Tensor const &m, float const alpha) {
     648              :   try {
     649        46231 :     itensor_->add(m, *this, alpha);
     650           23 :   } catch (std::exception &err) {
     651           23 :     ml_loge("%s %s", typeid(err).name(), err.what());
     652              :     return ML_ERROR_INVALID_PARAMETER;
     653           23 :   }
     654              :   return ML_ERROR_NONE;
     655              : }
     656              : 
/**
 * @brief Partial in-place add over @p len elements starting at @p addr_idx.
 * @note `alphas` is taken by value — NOTE(review): a const reference would
 *       avoid a Tensor copy, but changing it would mismatch the declaration.
 * @return error code from the underlying implementation
 */
int Tensor::add_i_partial(unsigned int len, unsigned int addr_idx, Tensor &m,
                          unsigned int incX, unsigned int incY,
                          const Tensor alphas, unsigned int alpha_idx) {
  return itensor_->add_i_partial(len, addr_idx, m, incX, incY, alphas,
                                 alpha_idx);
}
     663              : 
     664         5007 : Tensor Tensor::add(Tensor const &m, float const alpha) const {
     665         5007 :   Tensor t("", getFormat(), getDataType());
     666        10009 :   return this->add(m, t, alpha);
     667         5007 : }
     668              : 
/**
 * @brief Compute *this + alpha * m into @p output.
 * @throws std::invalid_argument when formats mismatch or any operand is
 *         non-contiguous
 * @return reference to @p output
 */
Tensor &Tensor::add(Tensor const &m, Tensor &output, float const alpha) const {
  NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
    << "Tensor Format of " << getName() << ":"
    << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
    << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";

  NNTR_THROW_IF(!itensor_->getContiguous() || !m.getContiguous() ||
                  !output.getContiguous(),
                std::invalid_argument)
    << getName() << " is not contiguous, cannot add";
  itensor_->add(m, output, alpha);
  return output;
}
     682              : 
     683          171 : int Tensor::subtract_i(float const &value) {
     684          171 :   this->subtract(value, *this);
     685          171 :   return ML_ERROR_NONE;
     686              : }
     687              : 
     688            3 : Tensor Tensor::subtract(float const &value) const {
     689            3 :   Tensor output("", getFormat(), getDataType());
     690            6 :   return subtract(value, output);
     691            3 : }
     692              : 
     693          174 : Tensor &Tensor::subtract(float const &value, Tensor &output) const {
     694          174 :   itensor_->subtract(value, output);
     695          174 :   return output;
     696              : }
     697              : 
/// @brief In-place elementwise subtract, implemented as add with alpha = -1.
int Tensor::subtract_i(Tensor const &m) { return add_i(m, -1); }
     699              : 
     700         5866 : Tensor Tensor::subtract(Tensor const &m) const {
     701         5866 :   Tensor t("", getFormat(), getDataType());
     702        11727 :   return this->subtract(m, t);
     703         5866 : }
     704              : 
     705         8127 : Tensor &Tensor::subtract(Tensor const &m, Tensor &output) const {
     706         8127 :   return add(m, output, -1);
     707              : }
     708              : 
     709              : /**
     710              :  * This is to sum the Tensor data according to the dim.batch().
     711              :  * Therefore the result has M(dim.batch(), 1, 1, 1) dimension.
     712              :  */
     713          453 : Tensor Tensor::sum_by_batch() const {
     714          453 :   NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
     715            0 :     << getName() << " is not contiguous, cannot sum";
     716              : 
     717          453 :   Tensor output(batch(), 1, 1, 1, this->getFormat(), getDataType());
     718          453 :   itensor_->sum_by_batch(output);
     719          453 :   return output;
     720            0 : }
     721              : 
     722        60390 : Tensor Tensor::sum(unsigned int axis, float alpha) const {
     723        60390 :   Tensor output("", this->getFormat(), this->getDataType());
     724       120777 :   return sum(axis, output, alpha, 0);
     725        60390 : }
     726              : 
/**
 * @brief Sum over @p axis into @p output: output = alpha * sum + beta * output.
 * @throws std::invalid_argument when the tensor is not contiguous
 * @return reference to @p output
 */
Tensor &Tensor::sum(unsigned int axis, Tensor &output, float alpha,
                    float beta) const {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot sum";

  itensor_->sum(axis, output, alpha, beta);
  return output;
}
     735              : 
     736        52083 : Tensor Tensor::sum(const std::vector<unsigned int> &axes, float alpha) const {
     737        52083 :   Tensor output("", this->getFormat());
     738       104165 :   return sum(axes, output, alpha);
     739        52083 : }
     740              : 
     741        59867 : Tensor &Tensor::sum(const std::vector<unsigned int> &axes, Tensor &output,
     742              :                     float alpha) const {
     743        59867 :   if (axes.empty())
     744            1 :     throw std::invalid_argument("empty axes given");
     745              : 
     746        59866 :   if (axes.size() == 1) {
     747          776 :     this->sum(axes[0], output, alpha);
     748              :   } else {
     749              : 
     750              :     /** club axes together */
     751        59090 :     Tensor new_reshaped = Tensor(getDim());
     752        59090 :     new_reshaped.copy(*this);
     753        59090 :     std::vector<unsigned int> continuous_order = {0, 3, 1, 2};
     754        59090 :     std::vector<unsigned int> new_axes = {axes[0]};
     755              : 
     756       229295 :     for (unsigned int i = 1; i < axes.size(); ++i) {
     757       170205 :       if (checkContinuous(axes[i - 1], axes[i])) {
     758       169889 :         new_reshaped.mergeAxis(axes[i - 1], axes[i]);
     759       169889 :         new_axes.back() = axes[i];
     760              :       } else {
     761          316 :         new_axes.push_back(axes[i]);
     762              :       }
     763              :     }
     764              : 
     765        59090 :     Tensor ret = new_reshaped.sum(new_axes[0]);
     766        59093 :     for (unsigned int i = 1; i < new_axes.size() - 1; ++i)
     767            6 :       ret = ret.sum(axes[i]);
     768        59090 :     ret.sum(new_axes.back(), output, alpha);
     769        59090 :   }
     770        59866 :   return output;
     771              : }
     772              : 
     773            0 : Tensor &Tensor::abs(Tensor &output) const {
     774            0 :   if (size() != output.size() || getDataType() != output.getDataType() ||
     775            0 :       getFormat() != output.getFormat())
     776              :     throw std::invalid_argument(
     777              :       "Error: Tensor::abs requires output tensor to be same size, data type "
     778            0 :       "and format as input tensor.");
     779            0 :   return itensor_->abs(output);
     780              : }
     781              : 
     782         5009 : Tensor Tensor::average(unsigned int axis) const {
     783         5009 :   Tensor output("", this->getFormat(), this->getDataType());
     784        10016 :   return average(axis, output);
     785         5009 : }
     786              : 
     787         5009 : Tensor &Tensor::average(unsigned int axis, Tensor &output) const {
     788         5009 :   if (axis >= TensorDim::MAXDIM)
     789              :     throw std::out_of_range(
     790            2 :       "negative axis or axis more then MAXDIM is invalid");
     791              : 
     792         5007 :   unsigned int axis_size = getDim()[axis];
     793         5007 :   if (axis_size == 1)
     794            0 :     output.copy(*this);
     795              :   else
     796         5007 :     this->sum(axis, output, 1.0 / ((float)axis_size));
     797              : 
     798         5007 :   return output;
     799              : }
     800              : 
     801            5 : Tensor Tensor::average(const std::vector<unsigned int> &axes) const {
     802            5 :   Tensor output("", this->getFormat(), this->getDataType());
     803            9 :   return average(axes, output);
     804            5 : }
     805              : 
     806          948 : Tensor &Tensor::average(const std::vector<unsigned int> &axes,
     807              :                         Tensor &output) const {
     808          948 :   if (axes.empty())
     809            0 :     return this->average(output);
     810              : 
     811          948 :   TensorDim ret_shape(getTensorType());
     812              : 
     813         2207 :   for (const auto &idx : axes) {
     814         1260 :     if (idx >= TensorDim::MAXDIM) {
     815            1 :       throw std::out_of_range("axis more then MAXDIM is invalid");
     816              :     }
     817         1259 :     ret_shape.setTensorDim(idx, getDim().getTensorDim(idx));
     818              :   }
     819              : 
     820          947 :   return this->sum(axes, output, 1.0 / (float)ret_shape.getDataLen());
     821              : }
     822              : 
/**
 * @brief Average over every element of the tensor.
 *
 * Flattens a copy to a single axis (channel axis for NHWC, width axis for
 * NCHW) and averages over it, producing a scalar-shaped tensor.
 */
Tensor Tensor::average() const {
  Tensor output = *this;
  unsigned int axis = 0;
  if (this->getFormat() == Tformat::NHWC) {
    output.reshape({1, getDim().getDataLen(), 1, 1, this->getTensorType()});
    axis = 1;
  } else {
    output.reshape({1, 1, 1, getDim().getDataLen(), this->getTensorType()});
    axis = 3;
  }
  return output.average(axis);
}
     835              : 
     836            0 : Tensor &Tensor::average(Tensor &output) const {
     837            0 :   Tensor result = *this;
     838            0 :   result.reshape({1, 1, 1, getDim().getDataLen()});
     839            0 :   return result.average(3, output);
     840            0 : }
     841              : 
     842           22 : int Tensor::pow_i(float exponent) {
     843           22 :   pow(exponent, *this);
     844           22 :   return ML_ERROR_NONE;
     845              : }
     846              : 
     847           13 : Tensor Tensor::pow(float exponent) const {
     848           13 :   Tensor output("", getFormat(), getDataType());
     849           26 :   return pow(exponent, output);
     850           13 : }
     851              : 
     852          790 : Tensor &Tensor::pow(float exponent, Tensor &output) const {
     853          790 :   itensor_->pow(exponent, output);
     854          790 :   return output;
     855              : }
     856              : 
     857            0 : int Tensor::sqrt_i() {
     858            0 :   this->sqrt(*this);
     859            0 :   return ML_ERROR_NONE;
     860              : }
     861              : 
     862            0 : Tensor Tensor::sqrt() const {
     863            0 :   Tensor output("", getFormat(), getDataType());
     864            0 :   return sqrt(output);
     865            0 : };
     866              : 
     867            6 : Tensor &Tensor::sqrt(Tensor &output) const {
     868           12 :   if (size() != output.size() || getDataType() != output.getDataType() ||
     869            6 :       getFormat() != output.getFormat())
     870              :     throw std::invalid_argument(
     871              :       "Error: Tensor::sqrt requires output tensor to be same size, data type "
     872            0 :       "and format as input tensor.");
     873              : 
     874            6 :   itensor_->sqrt(output);
     875            6 :   return output;
     876              : };
     877              : 
     878            0 : Tensor Tensor::neg() const {
     879            0 :   Tensor output("", getFormat(), getDataType());
     880            0 :   return neg(output);
     881            0 : };
     882              : 
     883            0 : Tensor &Tensor::neg(Tensor &output) const {
     884            0 :   if (size() != output.size() || getDataType() != output.getDataType() ||
     885            0 :       getFormat() != output.getFormat())
     886              :     throw std::invalid_argument(
     887              :       "Error: Tensor::sqrt requires output tensor to be same size, data type "
     888            0 :       "and format as input tensor.");
     889              : 
     890            0 :   itensor_->multiply(-1, output);
     891            0 :   return output;
     892              : };
     893              : 
     894            0 : int Tensor::erf_i() {
     895            0 :   erf(*this);
     896            0 :   return ML_ERROR_NONE;
     897              : }
     898              : 
     899            1 : Tensor Tensor::erf() const {
     900            1 :   Tensor output("", getFormat(), getDataType());
     901            2 :   return erf(output);
     902            1 : }
     903              : 
     904            1 : Tensor &Tensor::erf(Tensor &output) const {
     905            1 :   itensor_->erf(output);
     906            1 :   return output;
     907              : }
     908              : 
     909           12 : void Tensor::sin(Tensor &out, float alpha) const {
     910           12 :   if (size() != out.size())
     911            1 :     throw std::invalid_argument("Error: Size of out of Tensor::sin must match");
     912              : 
     913           11 :   itensor_->sin(out, alpha);
     914           11 : }
     915              : 
     916           14 : void Tensor::cos(Tensor &out, float alpha) const {
     917           14 :   if (size() != out.size())
     918            0 :     throw std::invalid_argument("Error: Size of out of Tensor::cos must match");
     919              : 
     920           14 :   itensor_->cos(out, alpha);
     921           14 : }
     922              : 
     923            6 : void Tensor::tan(Tensor &output, float alpha) const {
     924           12 :   if (size() != output.size() || getDataType() != output.getDataType() ||
     925            6 :       getFormat() != output.getFormat())
     926              :     throw std::invalid_argument(
     927              :       "Error: Tensor::abs requires output tensor to be same size, data type "
     928            0 :       "and format as input tensor.");
     929              : 
     930            6 :   itensor_->tan(output, alpha);
     931            6 : }
     932              : 
/// @brief In-place elementwise inverse square root (1/sqrt(x)).
void Tensor::inv_sqrt_i() { itensor_->inv_sqrt(*this); }
     934              : 
     935            3 : Tensor Tensor::inv_sqrt(Tensor &out) const {
     936            3 :   itensor_->inv_sqrt(out);
     937            3 :   return out;
     938              : }
     939              : 
/// @brief Start a lazily-evaluated operation chain on this tensor.
LazyTensor Tensor::chain() const { return LazyTensor(*this); }
     941              : 
/// @brief Euclidean (L2) norm of the tensor's elements.
float Tensor::l2norm() const { return itensor_->l2norm(); }
     943              : 
/**
 * @brief In-place min-max normalization to the range [0, 1].
 * @throws std::invalid_argument when the tensor is not contiguous
 */
void Tensor::normalization_i() {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot do normalization.";

  const float min = minValue();
  const float max = maxValue();

  // Constant tensor: subtract itself so everything becomes zero instead of
  // dividing by (max - min) == 0.
  if (max == min) {
    Tensor tmp = *this;
    this->subtract_i(tmp);
  } else {
    this->subtract_i(min);
    this->divide_i(max - min);
  }
}
     959              : 
/**
 * @brief In-place per-batch standardization (subtract mean, divide by the
 *        per-batch deviation derived from the post-subtraction L2 norm).
 * @throws std::invalid_argument for FP16 data when built without enable-fp16
 */
void Tensor::standardization_i() {
  // Per-batch mean: batch-wise sum divided by the feature length.
  Tensor mean_by_batch = this->sum_by_batch();
  mean_by_batch.divide_i(static_cast<float>(getDim().getFeatureLen()));

  this->subtract_i(mean_by_batch);
  Tensor std_dev_by_batch(batch(), 1, 1, 1, getFormat(), getDataType());
  std_dev_by_batch.setZero();

  /// @todo remove conditional statement
  if (getDataType() == ml::train::TensorDim::DataType::FP32) {
    float *std_dev = std_dev_by_batch.getData<float>();

    // L2 norm of each mean-subtracted batch slice.
    for (unsigned int k = 0; k < batch(); ++k) {
      Tensor sub_this = this->getBatchSlice(k, 1);
      std_dev[k] = sub_this.l2norm();
    }
  } else if (getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
    _FP16 *std_dev = std_dev_by_batch.getData<_FP16>();

    for (unsigned int k = 0; k < batch(); ++k) {
      Tensor sub_this = this->getBatchSlice(k, 1);
      std_dev[k] = static_cast<_FP16>(sub_this.l2norm());
    }
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  }

  std_dev_by_batch.divide_i(static_cast<float>(getDim().getFeatureLen()));
  this->divide_i(std_dev_by_batch);
}
     992              : 
/**
 * @brief Batched dot product over vectors of input/output tensor pointers.
 * @throws std::invalid_argument when this tensor is not contiguous
 */
void Tensor::dot(std::vector<Tensor *> input, std::vector<Tensor *> output,
                 bool trans, bool trans_in, float beta) const {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous. Cannot dot product.";

  itensor_->dot(input, output, trans, trans_in, beta);
}
    1000              : 
    1001         2079 : Tensor Tensor::dot(Tensor const &input, bool trans, bool trans_in) const {
    1002         2079 :   Tensor output("", getFormat(), getDataType());
    1003         2079 :   dot(input, output, trans, trans_in);
    1004              : 
    1005         2076 :   return output;
    1006            3 : }
    1007              : 
    1008              : /**
    1009              :  * @note: This dot product flattens the fist 3 axis for the purpose of
    1010              :  * computation. So, while performing, these matrices are behaving as 2-D
    1011              :  * matrices. The dimensions are restored while returning back the tensor
    1012              :  * in case of trans is false.
    1013              :  */
    1014        36830 : Tensor &Tensor::dot(Tensor const &input, Tensor &output, bool trans,
    1015              :                     bool trans_in, float beta) const {
    1016        36830 :   NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    1017            0 :     << getName() << " is not contiguous. Cannot dot product.";
    1018              : 
    1019        36830 :   itensor_->dot(input, output, trans, trans_in, beta);
    1020        36827 :   return output;
    1021              : }
    1022              : 
    1023         1457 : Tensor &Tensor::dot_deriv_wrt_1(Tensor const &m, Tensor const &output_deriv,
    1024              :                                 bool trans, bool trans_m, float beta) {
    1025              :   bool deriv_trans_m = true;
    1026              :   bool deriv_trans = false;
    1027              :   /** @todo handle all cases of trans and trans_m */
    1028         1457 :   if (!trans && trans_m) {
    1029              :     deriv_trans_m = false;
    1030              :   }
    1031              : 
    1032         1457 :   return output_deriv.dot(m, *this, deriv_trans, deriv_trans_m, beta);
    1033              : }
    1034              : 
    1035              : /**
    1036              :  * @brief compute the derivative wrt m in the m tensor
    1037              :  * @note The caller tensor must be the same tensor as the one which called the
    1038              :  * dot() product.
    1039              :  */
    1040         6636 : Tensor &Tensor::dot_deriv_wrt_2(Tensor &m_deriv, Tensor const &output_deriv,
    1041              :                                 bool trans, bool trans_m, float beta) const {
    1042              :   bool deriv_trans_m = false;
    1043              :   bool deriv_trans = true;
    1044              :   /** @todo handle all cases of trans and trans_m */
    1045              : 
    1046         6636 :   if (!trans && trans_m) {
    1047            0 :     output_deriv.dot(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
    1048            0 :     return m_deriv;
    1049              :   } else {
    1050         6636 :     return dot(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
    1051              :   }
    1052              : }
    1053              : 
    1054          759 : Tensor &Tensor::dotBatched(Tensor const &m, Tensor &result, bool trans,
    1055              :                            bool trans_m, float beta) const {
    1056          759 :   if (!result.isAllocated())
    1057              :     throw std::invalid_argument(
    1058            0 :       "Output tensor must be preallocated for dotBatched operation");
    1059              : 
    1060          759 :   size_t lcm = std::lcm(batch(), m.batch());
    1061          759 :   size_t group_size = lcm / batch();
    1062          759 :   size_t m_group_size = lcm / m.batch();
    1063              : 
    1064          761 :   NNTR_THROW_IF(!((lcm == batch() || lcm == m.batch())), std::invalid_argument)
    1065              :     << "The batch size of the given twon tensors must be the same"
    1066              :        "or the bigger one should be a multiple of the smaller one";
    1067              : 
    1068         4941 :   for (unsigned int b = 0; b < lcm; b++) {
    1069              :     /** @todo try using transpose to speedup the operation */
    1070         4184 :     const Tensor this_b = this->getBatchSlice(b / group_size, 1);
    1071         4184 :     Tensor m_b = m.getBatchSlice(b / m_group_size, 1);
    1072         4184 :     Tensor result_b = result.getBatchSlice(b, 1);
    1073              : 
    1074         4184 :     this_b.dot(m_b, result_b, trans, trans_m, beta);
    1075         4184 :   }
    1076              : 
    1077          757 :   return result;
    1078              : }
    1079              : 
    1080          176 : Tensor &Tensor::dot_batched_deriv_wrt_1(Tensor const &m,
    1081              :                                         Tensor const &output_deriv, bool trans,
    1082              :                                         bool trans_m, float beta) {
    1083              :   bool deriv_trans_m = true;
    1084              :   bool deriv_trans = false;
    1085              :   /** @todo handle all cases of trans and trans_m */
    1086          176 :   if (!trans && trans_m) {
    1087              :     deriv_trans_m = false;
    1088              :   }
    1089              : 
    1090          176 :   return output_deriv.dotBatched(m, *this, deriv_trans, deriv_trans_m, beta);
    1091              : }
    1092              : 
    1093          176 : Tensor &Tensor::dot_batched_deriv_wrt_2(Tensor &m_deriv,
    1094              :                                         Tensor const &output_deriv, bool trans,
    1095              :                                         bool trans_m, float beta) const {
    1096              :   bool deriv_trans_m = false;
    1097              :   bool deriv_trans = true;
    1098              :   /** @todo handle all cases of trans and trans_m */
    1099              : 
    1100          176 :   if (!trans && trans_m) {
    1101           88 :     output_deriv.dotBatched(*this, m_deriv, deriv_trans, deriv_trans_m, beta);
    1102           88 :     return m_deriv;
    1103              :   } else {
    1104           88 :     return dotBatched(output_deriv, m_deriv, deriv_trans, deriv_trans_m, beta);
    1105              :   }
    1106              : }
    1107              : 
    1108            0 : Tensor Tensor::dropout_mask(float dropout) const {
    1109            0 :   Tensor output(getDim());
    1110            0 :   output.dropout_mask(dropout);
    1111            0 :   return output;
    1112            0 : }
    1113              : 
    1114           10 : void Tensor::dropout_mask(float dropout) {
    1115              :   /// @todo add unittest
    1116           10 :   NNTR_THROW_IF(dropout < 0 || dropout > 1, std::invalid_argument)
    1117              :     << "[Tensor::dropout_mask] Dropout rate should be between 0 and 1";
    1118              : 
    1119              :   // if the rate is zero, no change is needed
    1120              :   if (std::fpclassify(dropout) == FP_ZERO)
    1121              :     return;
    1122              : 
    1123           10 :   setRandUniform(0.0, 1.0);
    1124           10 :   itensor_->dropout_mask(dropout);
    1125              : }
    1126              : 
/**
 * Applies a filter mask to this tensor, delegating to the underlying
 * tensor implementation. See the itensor_ implementation for semantics.
 */
void Tensor::filter_mask(const Tensor &mask_len, bool reverse) {
  /// @todo add unittest
  itensor_->filter_mask(mask_len, reverse);
}
    1131              : 
    1132            3 : Tensor Tensor::zoneout_mask(float zoneout) {
    1133            3 :   Tensor output(getDim());
    1134            3 :   zoneout_mask(output, zoneout);
    1135            3 :   return output;
    1136            0 : }
    1137              : 
    1138            4 : void Tensor::zoneout_mask(Tensor &opposite, float zoneout) {
    1139            5 :   NNTR_THROW_IF(getDim() != opposite.getDim(), std::invalid_argument)
    1140              :     << "[Tensor::zoneout_mask] opposite dimension does not match";
    1141              : 
    1142            3 :   NNTR_THROW_IF(zoneout < 0 || zoneout > 1, std::invalid_argument)
    1143              :     << "[Tensor::zoneout_mask] Zoneout rate should be between 0 and 1";
    1144              : 
    1145              :   // if the rate is zero, no change is needed
    1146              :   if (std::fpclassify(zoneout) == FP_ZERO)
    1147              :     return;
    1148              : 
    1149            3 :   itensor_->zoneout_mask(opposite, zoneout);
    1150              : }
    1151              : 
    1152            8 : std::vector<Tensor> Tensor::split(unsigned num_size, int axis) {
    1153           10 :   NNTR_THROW_IF(num_size == 0, std::invalid_argument)
    1154              :     << "num size cannot be zero";
    1155              : 
    1156            6 :   if (axis == -1) {
    1157              :     axis = 3;
    1158              :   }
    1159              : 
    1160            6 :   NNTR_THROW_IF(!(0 <= axis && axis < 4), std::invalid_argument)
    1161              :     << "cannot split axis of axis: " << axis;
    1162              : 
    1163            7 :   NNTR_THROW_IF(getDim().getTensorDim(axis) % num_size != 0,
    1164              :                 std::invalid_argument)
    1165              :     << "axis is not divisible by num_size, axis: " << axis
    1166              :     << " num size: " << num_size;
    1167              : 
    1168              :   std::vector<size_t> sizes;
    1169            5 :   sizes.resize(num_size);
    1170              : 
    1171            5 :   unsigned int sz = getDim().getTensorDim(axis) / num_size;
    1172              :   std::fill(sizes.begin(), sizes.end(), sz);
    1173              : 
    1174           10 :   return split(sizes, axis);
    1175            5 : }
    1176              : 
    1177           14 : std::vector<Tensor> Tensor::split(std::vector<size_t> sizes, int axis) {
    1178           14 :   NNTR_THROW_IF(sizes.size() == 0, std::invalid_argument)
    1179              :     << "num size cannot be zero";
    1180              : 
    1181           14 :   NNTR_THROW_IF(!(-1 <= axis && axis < 4), std::invalid_argument)
    1182              :     << "cannot split axis of axis: " << axis;
    1183              : 
    1184           15 :   NNTR_THROW_IF(
    1185              :     std::any_of(sizes.begin(), sizes.end(), [](size_t sz) { return !sz; }),
    1186              :     std::invalid_argument)
    1187              :     << "among given sizes at least one of size is 0";
    1188              : 
    1189           13 :   return itensor_->split(sizes, axis);
    1190              : }
    1191              : 
/**
 * Concatenates the given tensors along `axis` into `output`; the actual
 * work is delegated to the underlying tensor implementation.
 */
Tensor Tensor::concat(const std::vector<Tensor> &tensors, int axis,
                      Tensor &output) {
  return itensor_->concat(tensors, axis, output);
}
    1196              : 
    1197           11 : Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis) {
    1198           11 :   if (axis == -1) {
    1199              :     axis = 3;
    1200              :   }
    1201              : 
    1202              :   // Create an output tensor to store the concatenation result
    1203           11 :   TensorDim out_dim = Tensor::calculateConcatOutputDim(tensors, axis);
    1204            5 :   Tensor output = Tensor(out_dim);
    1205              : 
    1206           10 :   return output.concat(tensors, axis, output);
    1207            5 : }
    1208              : 
    1209            1 : Tensor Tensor::cat(const std::vector<Tensor> &tensors, int axis,
    1210              :                    Tensor &output) {
    1211            1 :   if (axis == -1) {
    1212              :     axis = 3;
    1213              :   }
    1214              : 
    1215              :   // Check if the given output tensor dimension is valid
    1216            1 :   TensorDim out_dim = Tensor::calculateConcatOutputDim(tensors, axis);
    1217              : 
    1218            1 :   NNTR_THROW_IF(out_dim != output.getDim(), std::invalid_argument)
    1219            2 :     << "invalid output dim for concatenation " << output.getDim()
    1220              :     << "expected output dim " << out_dim;
    1221              : 
    1222            0 :   return output.concat(tensors, axis, output);
    1223              : }
    1224              : 
    1225           11 : void Tensor::print(std::ostream &out) const {
    1226           11 :   printInstance(out, this);
    1227           11 :   itensor_->print(out);
    1228           11 : }
    1229              : 
    1230            0 : void Tensor::putData() const { itensor_->putData(); }
    1231              : 
    1232       138468 : void Tensor::setData(const std::shared_ptr<MemoryData> buf, size_t off,
    1233              :                      bool init) {
    1234       276936 :   itensor_->setMemoryData(buf, off);
    1235              : 
    1236       138468 :   if (buf && init) {
    1237        19084 :     initialize();
    1238              :   }
    1239       138468 : }
    1240              : 
/// Returns the memory buffer currently backing this tensor.
const std::shared_ptr<MemoryData> Tensor::getMemoryData() const {
  return itensor_->getMemoryData();
}

/// Returns this tensor's offset into its backing memory buffer.
size_t Tensor::getOffset() const { return itensor_->getOffset(); }
    1246              : 
/**
 * Copies the contents of `from` into this tensor. If the sizes, scale
 * sizes and data types match, data is copied in place; otherwise *this is
 * replaced by a new tensor constructed from the source's raw buffer.
 * @throws std::runtime_error when either tensor is non-contiguous
 */
void Tensor::copy(const Tensor &from) {
  /// @todo enable copy to non-contiguous tensor
  if (!itensor_->getContiguous() || !from.getContiguous()) {
    throw std::runtime_error("Cannot copy non-contiguous tensor");
  }

  if (from.size() != 0 && size() == from.size() &&
      scale_size() == from.scale_size() &&
      getDataType() == from.getDataType()) {
    // if tensor size and data type match, copy data
    itensor_->copy(from);
  } else {
    // otherwise rebuild a tensor from the source's raw buffer and swap it
    // into *this, replacing dimension, type and data in one step
    Tensor t = Tensor(from.getDim(), from.getData<char>());
    swap(t, *this);
  }
}
    1263              : 
    1264        13734 : void Tensor::copyData(const Tensor &from) { itensor_->copyData(from); }
    1265              : 
    1266         3511 : void Tensor::copy_with_stride(const Tensor &from) {
    1267         3511 :   if (itensor_->getDim() == from.getDim()) {
    1268              :     // If the tensor dim matches, copy the data. This also applies to
    1269              :     // uncontigous tensor.
    1270            5 :     itensor_->copy_with_stride(from, *this);
    1271              :   } else {
    1272              :     // replace with a new tensor that has the same data as the given tensor
    1273         3506 :     Tensor t = Tensor(from.getDim(), true);
    1274         3506 :     itensor_->copy_with_stride(from, t);
    1275              :     swap(t, *this);
    1276         3506 :   }
    1277         3511 : }
    1278              : 
    1279       230846 : Tensor Tensor::getBatchSlice(size_t offset, unsigned int size) const {
    1280       230846 :   TensorDim dim_ = getDim();
    1281       230846 :   dim_.batch(size);
    1282              : 
    1283       461692 :   return getSharedDataTensor(dim_, offset * this->getDim().getFeatureLen(),
    1284       692536 :                              true, "");
    1285              : }
    1286              : 
    1287          107 : Tensor Tensor::getBatchSlice(const std::vector<unsigned int> &indices) const {
    1288              : 
    1289              :   // Validate tensor contiguity
    1290          107 :   NNTR_THROW_IF(!this->getContiguous(), std::runtime_error)
    1291              :     << "getBatchSlice requires contiguous tensor layer";
    1292              : 
    1293              :   // Validate indices vector is not empty
    1294          108 :   NNTR_THROW_IF(indices.empty(), std::invalid_argument)
    1295              :     << "Indices vector cannot be empty";
    1296              : 
    1297              :   // Validate indices
    1298          106 :   const unsigned batch_size = getDim().batch();
    1299          217 :   for (auto idx : indices) {
    1300          113 :     NNTR_THROW_IF(idx >= batch_size, std::out_of_range)
    1301              :       << "Batch index " << idx << " out of range [0," << batch_size << ")";
    1302              :   }
    1303              : 
    1304              :   // Get original tensor dimensions
    1305          105 :   const TensorDim &orig_dim = this->getDim();
    1306          105 :   const size_t element_size = orig_dim.getDataTypeSize();
    1307              : 
    1308              :   // Calculate single batch size in elements
    1309          105 :   const size_t single_batch_size = orig_dim.getFeatureLen();
    1310              : 
    1311              :   // Create output tensor with selected batches
    1312          105 :   TensorDim new_dim = orig_dim;
    1313          105 :   new_dim.batch(indices.size());
    1314          105 :   Tensor output(new_dim);
    1315              : 
    1316              :   // Validate output tensor size
    1317          105 :   const size_t output_bytes = output.bytes();
    1318          105 :   const size_t single_batch_bytes = single_batch_size * element_size;
    1319              : 
    1320              :   // Get raw data pointers
    1321              :   const unsigned char *src_data =
    1322              :     static_cast<const unsigned char *>(this->getData<unsigned char>());
    1323              :   unsigned char *dst_data =
    1324              :     static_cast<unsigned char *>(output.getData<void>());
    1325              : 
    1326              : // Parallel copy using OpenMP
    1327          105 : #pragma omp parallel for schedule(static)
    1328              :   for (int i = 0; i < static_cast<int>(indices.size()); ++i) {
    1329              :     const unsigned batch_idx = indices[i];
    1330              : 
    1331              :     // Calculate memory offsets
    1332              :     const size_t src_offset =
    1333              :       static_cast<size_t>(batch_idx) * single_batch_bytes;
    1334              :     const size_t dst_offset = static_cast<size_t>(i) * single_batch_bytes;
    1335              : 
    1336              :     // Bounds check for destination buffer
    1337              :     NNTR_THROW_IF(dst_offset + single_batch_bytes > output_bytes,
    1338              :                   std::runtime_error)
    1339              :       << "Destination buffer overflow detected";
    1340              : 
    1341              :     // Perform memory copy
    1342              :     std::memcpy(dst_data + dst_offset, src_data + src_offset,
    1343              :                 single_batch_bytes);
    1344              :   }
    1345              : 
    1346          105 :   return output;
    1347            0 : }
    1348              : 
    1349         7492 : Tensor Tensor::clone() const {
    1350        14984 :   Tensor output(getName(), getFormat(), getDataType());
    1351         7492 :   output.copy(*this);
    1352         7492 :   return output;
    1353            0 : }
    1354              : 
    1355         1086 : Tensor Tensor::clone(ml::train::TensorDim::DataType type) const {
    1356         1086 :   if (getDataType() == type)
    1357         1078 :     return clone();
    1358            8 :   TensorDim dim = getDim();
    1359              :   dim.setDataType(type);
    1360            8 :   Tensor output(dim, true);
    1361            8 :   output.copyData(*this);
    1362            8 :   output.setName(getName());
    1363              :   return output;
    1364            8 : }
    1365              : 
/// Delegates the FSU read to the underlying tensor implementation.
void Tensor::readFSU() { itensor_->readFSU(); }

/**
 * Serializes this tensor's data to the given stream.
 * @throws std::invalid_argument when the tensor is non-contiguous
 */
void Tensor::save(std::ostream &file) {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot save.";

  itensor_->save(file);
}
    1374              : 
/**
 * Reads tensor data from the given file stream.
 * For virtual tensors the read is deferred: only the file descriptor and
 * offset are recorded here, and the actual mapping happens in activate().
 * @throws std::invalid_argument when the tensor is non-contiguous
 */
void Tensor::read(std::ifstream &file, size_t start_offset,
                  bool read_from_offset, int file_fd) {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot read.";

  // save the start_offset_info
  read_offset = start_offset;

  // Do not read now but save file_fd in tensor
  if (is_virtual) {
    fd = file_fd;
    return;
  }

  itensor_->read(file, start_offset, read_from_offset);
}
    1391              : 
/**
 * Reads tensor data from an arbitrary ReadSource.
 * @throws std::invalid_argument when the tensor is non-contiguous
 */
void Tensor::read(ReadSource src, size_t start_offset, bool read_from_offset) {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot read.";

  itensor_->read(src, start_offset, read_from_offset);
}

/// Returns the per-batch indices of the maximum elements.
std::vector<unsigned int> Tensor::argmax() const {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot get argmax.";
  return itensor_->argmax();
}

/// Returns the per-batch indices of the minimum elements.
std::vector<unsigned int> Tensor::argmin() const {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot get argmin.";
  return itensor_->argmin();
}
    1410              : 
    1411            9 : std::pair<Tensor, Tensor> Tensor::topK(unsigned int k) const {
    1412              : 
    1413              :   // Create output tensor with modified W dimension
    1414            9 :   TensorDim output_dim = getDim();
    1415            9 :   TensorDim indices_dim = getDim();
    1416              :   Tformat format = output_dim.getFormat();
    1417              : 
    1418              :   // Validate k is within width dimension size
    1419            9 :   unsigned int width_size = output_dim.width();
    1420           11 :   NNTR_THROW_IF(k == 0 || k > width_size, std::invalid_argument)
    1421              :     << "k must be between 1 and width dimension size (" << width_size << ")";
    1422              : 
    1423              :   // Set new width dimension to k
    1424            7 :   output_dim.width(k);
    1425            7 :   indices_dim.width(k);
    1426              :   indices_dim.setDataType(Tdatatype::UINT32); // Set indices data type to UINT32
    1427              : 
    1428              :   // Create output tensor
    1429            7 :   Tensor output(output_dim);
    1430            7 :   output.allocate();
    1431            7 :   Tensor indices(indices_dim);
    1432            7 :   indices.allocate();
    1433              : 
    1434              :   // Prepare output buffer
    1435              :   void *output_data = output.getData<void>();
    1436              :   uint32_t *indices_data = indices.getData<uint32_t>();
    1437              : 
    1438              :   // Call TopK implementation
    1439            7 :   itensor_->topK(k, output_data, indices_data);
    1440              : 
    1441           12 :   return {output, indices};
    1442            8 : }
    1443              : 
/**
 * Returns the maximum absolute value over all elements.
 * @throws std::invalid_argument when the tensor is non-contiguous
 */
float Tensor::max_abs() const {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous, cannot get max_abs.";
  return itensor_->max_abs();
}

/// Returns the maximum element value.
float Tensor::maxValue() const { return itensor_->maxValue(); }

/// Returns the minimum element value.
float Tensor::minValue() const { return itensor_->minValue(); }
    1453              : 
    1454         1107 : Tensor Tensor::transpose(const std::string &direction) const {
    1455         1107 :   Tensor output(getDim());
    1456         1107 :   transpose(direction, output);
    1457         1107 :   return output;
    1458            0 : }
    1459              : 
/**
 * Transposes this tensor into `output` according to the direction string.
 * When `output` aliases this tensor's buffer, the source is cloned first so
 * the transpose never reads and writes the same memory.
 * @throws std::invalid_argument when this tensor is non-contiguous
 */
Tensor &Tensor::transpose(const std::string &direction, Tensor &output) const {
  NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    << getName() << " is not contiguous. Cannot transpose.";

  // self-aliased output: transpose from a clone to avoid overlap
  if (output.getData<char>() == getData<char>()) {
    Tensor result = clone();
    return result.transpose(direction, output);
  }

  itensor_->transpose(direction, output);

  return output;
}
    1473              : 
    1474        29183 : void Tensor::reshape(const TensorDim &d) { itensor_->reshape(d); }
    1475              : 
/**
 * Fills this tensor with the data of `from`. When `allocate` is set and
 * this tensor is still empty, it simply becomes a copy of `from`;
 * otherwise both tensors must be contiguous and agree in dimension and
 * strides.
 * @throws nntrainer::exception::not_supported for non-contiguous tensors
 * @throws std::invalid_argument on dimension or stride mismatch
 */
void Tensor::fill(const Tensor &from, bool allocate) {
  if (allocate && this->empty()) {
    this->copy(from);
    return;
  }

  if (!from.getContiguous() || !getContiguous()) {
    /// @todo enable this if needed
    throw nntrainer::exception::not_supported(
      "[Tensor::fill] non-contiguous tensors are not supported");
  }

  if (getDim() != from.getDim()) {
    throw std::invalid_argument("[Tensor::fill] dimension must be the same");
  }

  if (getStrides() != from.getStrides()) {
    /// @todo length does not represent buffer size, there should be way to
    /// get the buffer size
    throw std::invalid_argument("[Tensor::fill] buffer size must be the same");
  }

  copyData(from);
}
    1500              : 
/* Thin accessors below delegate directly to the underlying tensor
 * implementation (itensor_). */

/// Returns the tensor dimension.
TensorDim Tensor::getDim() const { return itensor_->getDim(); }

/// Returns the tensor type (format + data type).
TensorDim::TensorType Tensor::getTensorType() const {
  return itensor_->getTensorType();
};

/// Returns the configured initializer.
Initializer Tensor::getInitializer() const {
  return itensor_->getInitializer();
}

/// Returns the memory format (e.g. NCHW / NHWC).
TensorDim::Format Tensor::getFormat() const { return itensor_->getFormat(); }

/// Returns the element data type.
Tdatatype Tensor::getDataType() const { return itensor_->getDataType(); }

/// Updates the batch dimension.
void Tensor::updateBatch(unsigned int batch) { itensor_->updateBatch(batch); }

/// Updates the whole dimension.
void Tensor::updateDimension(TensorDim dimension) {
  itensor_->updateDimension(dimension);
}

/// Returns whether the tensor memory is contiguous.
const bool Tensor::getContiguous() const noexcept {
  return itensor_->getContiguous();
}

/// Returns the per-axis strides.
const std::array<size_t, TensorDim::MAXDIM>
Tensor::getStrides() const noexcept {
  return itensor_->getStrides();
}
    1529              : 
    1530       170205 : bool Tensor::checkContinuous(unsigned int np1, unsigned int np2) const {
    1531       170205 :   if (np1 > 3 || np2 > 3) {
    1532              :     throw std::invalid_argument(
    1533            0 :       "Error: Input value must be within the range of 0 to 3.");
    1534              :   }
    1535              : 
    1536       170205 :   if (getFormat() == Tformat::NCHW) {
    1537       170205 :     if (np1 + 1 == np2)
    1538              :       return true;
    1539              :   } else {
    1540            0 :     std::vector<unsigned int> continuous_order_nhwc = {0, 3, 1, 2};
    1541            0 :     if (continuous_order_nhwc[np2] == continuous_order_nhwc[np1] + 1)
    1542              :       return true;
    1543            0 :   }
    1544              : 
    1545              :   return false;
    1546              : }
    1547              : 
/* Thin accessors below delegate directly to the underlying tensor
 * implementation (itensor_). */

/// Records the tensor's offset within its backing file.
void Tensor::setFileOffset(const size_t file_offset) {
  itensor_->setFileOffset(file_offset);
}

/// Returns the tensor's offset within its backing file.
size_t Tensor::getFileOffset() const { return itensor_->getFileOffset(); }

/// Sets the tensor name.
void Tensor::setName(const std::string &name_) { itensor_->setName(name_); }

/// Returns the tensor name.
const std::string &Tensor::getName() const { return itensor_->getName(); }

/// Returns the flat element index for coordinates (b, c, h, w).
size_t Tensor::getIndex(unsigned int b, unsigned int c, unsigned int h,
                        unsigned int w) const noexcept {
  return itensor_->getIndex(b, c, h, w);
}

/// Returns the total number of elements.
size_t Tensor::size() const { return itensor_->size(); }

/// Returns true when the tensor holds no elements.
bool Tensor::empty() const { return itensor_->empty(); }

/// Returns the data size in bytes.
size_t Tensor::bytes() const { return itensor_->bytes(); }

/// Returns the total allocated memory size in bytes.
size_t Tensor::getMemoryBytes() const { return itensor_->getMemoryBytes(); }

/// Returns the batch dimension.
size_t Tensor::batch() const { return itensor_->batch(); }

/// Returns the channel dimension.
size_t Tensor::channel() const { return itensor_->channel(); }

/// Returns the height dimension.
size_t Tensor::height() const { return itensor_->height(); }

/// Returns the width dimension.
size_t Tensor::width() const { return itensor_->width(); }

/// Returns the number of quantization scale factors.
size_t Tensor::scale_size() const { return itensor_->scale_size(); }

/// Returns the quantization scheme.
QScheme Tensor::q_scheme() const { return itensor_->q_scheme(); }
    1582              : 
    1583       169889 : void Tensor::mergeAxis(unsigned int axis1, unsigned int axis2) {
    1584       169889 :   NNTR_THROW_IF(!getContiguous(), std::invalid_argument)
    1585            0 :     << getName() << " is not contiguous, cannot merge axis";
    1586              : 
    1587       169889 :   if (axis2 != axis1 + 1)
    1588            0 :     if (!checkContinuous(axis1, axis2))
    1589            0 :       throw std::invalid_argument("axis2 must be axis1 + 1 for merging.");
    1590              : 
    1591       169889 :   itensor_->mergeAxis(axis1, axis2);
    1592       169889 : }
    1593              : 
/**
 * Makes `dest` share `src`'s data starting at the given element offset.
 */
void Tensor::createSharedDataTensor(const Tensor &src, Tensor &dest,
                                    size_t offset) const {
  itensor_->createSharedDataTensor(src.itensor_.get(), dest.itensor_.get(),
                                   offset);
}

/**
 * Returns a view of this tensor with dimension dim_ at the given element
 * offset; the view shares the underlying buffer (no copy is made).
 */
Tensor Tensor::getSharedDataTensor(const TensorDim dim_, size_t offset,
                                   bool reset_stride,
                                   const std::string &name_) const {
  Tensor ret = *this;
  itensor_->getSharedDataTensor(dim_, offset, reset_stride, name_,
                                ret.itensor_.get());
  return ret;
}
    1608              : 
    1609            0 : void Tensor::activate() {
    1610              : 
    1611            0 :   NNTR_THROW_IF(!is_virtual, std::invalid_argument)
    1612              :     << "non-virtual tensor cannot call activate()";
    1613              : #if defined(_WIN32)
    1614              :   NNTR_THROW_IF(true, std::invalid_argument)
    1615              :     << "[Error/VirtualTensor] virtual tensor is not supported on Windows";
    1616              : #else
    1617              : 
    1618            0 :   auto file_offset = getFileOffset();
    1619            0 :   size_t off = (file_offset / 4096) * 4096;
    1620            0 :   size_t diff = file_offset - off;
    1621            0 :   size_t len = getMemoryBytes() + diff;
    1622              : 
    1623            0 :   mapped_ptr = mmap(NULL, len, PROT_READ, MAP_PRIVATE, this->fd, off);
    1624              : #ifdef __ANDROID__
    1625              :   madvise(mapped_ptr, len, MADV_WILLNEED);
    1626              : #endif
    1627            0 :   if (mapped_ptr == MAP_FAILED) {
    1628            0 :     std::cerr << "[activate] mmap failed: " << strerror(errno) << std::endl;
    1629              :   }
    1630            0 :   itensor_->activate((void *)&((uint8_t *)mapped_ptr)[diff]);
    1631              : #endif
    1632            0 : }
    1633              : 
/**
 * Unmaps the file region previously mapped by activate() and notifies the
 * tensor implementation. A no-op when nothing is currently mapped. Only
 * valid on tensors marked is_virtual; not supported on Windows.
 * @throws std::invalid_argument when the tensor is not virtual
 * @throws std::runtime_error when munmap fails
 */
void Tensor::deactivate() {

  NNTR_THROW_IF(!is_virtual, std::invalid_argument)
    << "non-virtual tensor cannot call deactivate()";
#if defined(_WIN32)
  NNTR_THROW_IF(true, std::invalid_argument)
    << "[Error/VirtualTensor] virtual tensor is not supported on Windows";
#else

  if (mapped_ptr == nullptr) {
    return;
  };

  // recompute the same page-aligned range used by activate()
  auto file_offset = getFileOffset();
  size_t off = (file_offset / 4096) * 4096;
  size_t diff = file_offset - off;
  size_t len = getMemoryBytes() + diff;

  auto ret_munmap = munmap((void *)mapped_ptr, len);
  const size_t error_buflen = 100;
  char error_buf[error_buflen];
  NNTR_THROW_IF(ret_munmap == -1, std::runtime_error)
    << "[deactivate] munmap failed: "
    << SAFE_STRERROR(errno, error_buf, error_buflen);

  mapped_ptr = nullptr;
  itensor_->deactivate();
#endif
}
    1663              : 
/**
 * Points this tensor at an externally owned buffer with the given
 * dimension and element offset (no allocation is performed).
 */
void Tensor::setTensorVar(TensorDim d, void *buf, size_t offset) {
  itensor_->setTensorVar(d, buf, offset);
}
    1667              : 
    1668           12 : TensorDim Tensor::calculateConcatOutputDim(const std::vector<Tensor> &tensors,
    1669              :                                            int axis) {
    1670              :   // Check axis, in which the tensors are concatenated, is valid.
    1671           12 :   NNTR_THROW_IF(!(-1 <= axis && axis < 4), std::invalid_argument)
    1672              :     << "cannot concatenate tensors along an axis: " << axis;
    1673              : 
    1674              :   // Check if the number of input tensors is valid.
    1675           14 :   NNTR_THROW_IF(tensors.size() <= 1, std::invalid_argument)
    1676              :     << "received an invalid tensor vector. size must be greater than 1.";
    1677              : 
    1678           10 :   auto out_dim = tensors.front().getDim();
    1679              : 
    1680              :   // Check if all tensor data types are the same.
    1681           30 :   for (auto &t : tensors) {
    1682           22 :     NNTR_THROW_IF(t.getDataType() != out_dim.getDataType(),
    1683              :                   std::invalid_argument)
    1684              :       << "cannot concatenate tensors with different data types.";
    1685              :   }
    1686              : 
    1687              :   // Compute the dimensions of an output tensor.
    1688            9 :   out_dim.setTensorDim(axis, 1);
    1689           31 :   NNTR_THROW_IF(!std::all_of(tensors.begin(), tensors.end(),
    1690              :                              [&out_dim, axis](const Tensor &t) {
    1691              :                                auto cur_dim = t.getDim();
    1692              :                                cur_dim.setTensorDim(axis, 1);
    1693              :                                return out_dim == cur_dim;
    1694              :                              }),
    1695              :                 std::invalid_argument)
    1696              :     << " all tensor must have the same dimension except for the axis, out_dim: "
    1697              :     << out_dim << " axis : " << axis;
    1698              : 
    1699            6 :   auto axis_dim = std::accumulate(tensors.begin(), tensors.end(), 0u,
    1700           13 :                                   [axis](unsigned cur, const Tensor &t) {
    1701           13 :                                     return cur += t.getDim().getTensorDim(axis);
    1702              :                                   });
    1703              : 
    1704            6 :   out_dim.setTensorDim(axis, axis_dim);
    1705            6 :   return out_dim;
    1706              : }
    1707              : 
/**
 * @brief Stream-insertion operator for Tensor; delegates to Tensor::print().
 *
 * @param out   output stream to write the tensor representation to
 * @param input tensor to print
 * @return the same stream, to allow chaining
 */
std::ostream &operator<<(std::ostream &out, Tensor const &input) {
  input.print(out);
  return out;
}
    1712              : 
    1713              : } // namespace nntrainer
        

Generated by: LCOV version 2.0-1