LCOV - code coverage report
Current view: top level - nntrainer/tensor/cpu_backend/fallback - fallback_internal.cpp (source / functions) Coverage Total Hit
Test: coverage_filtered.info Lines: 65.0 % 274 178
Test Date: 2025-12-14 20:38:17 Functions: 53.8 % 65 35

            Line data    Source code
       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
       4              :  *
       5              :  * @file fallback_internal.cpp
       6              :  * @date   23 April 2024
       7              :  * @see    https://github.com/nnstreamer/nntrainer
       8              :  * @author Sungsik Kong <ss.kong@samsung.com>
       9              :  * @bug    No known bugs except for NYI items
      10              :  * @brief  Fallback computation functions (raw implementation)
      11              :  *
      12              :  */
      13              : 
      14              : #include <algorithm>
      15              : #include <assert.h>
      16              : #include <climits>
      17              : #include <cmath>
      18              : #include <cstdint>
      19              : #include <fallback_internal.h>
      20              : #include <limits>
      21              : #include <q4_0_utils.h>
      22              : #include <stdexcept>
      23              : #include <tensor_dim.h>
      24              : #include <util_func.h>
      25              : 
      26              : #define sgemv_loop(ci, cj, cM, cN)                                             \
      27              :   do {                                                                         \
      28              :     float y0;                                                                  \
      29              :     unsigned int i, j;                                                         \
      30              :     for (ci = 0; ci != cM; ci++) {                                             \
      31              :       y0 = 0.0f;                                                               \
      32              :       if (beta != 0.0f) {                                                      \
      33              :         y0 = Y[ci * incY] * beta;                                              \
      34              :       }                                                                        \
      35              :       for (cj = 0; cj != cN; cj++)                                             \
      36              :         y0 += A[i + j * lda] * X[cj * incX];                                   \
      37              :       Y[ci * incY] = y0;                                                       \
      38              :     }                                                                          \
      39              :   } while (0);
      40              : namespace nntrainer {
      41              : 
      42              : /**
      43              :  * @brief struct of q4_0x8 block
      44              :  */
      45              : struct block_q4_0x8 {
      46              :   uint16_t d[8];   // 16B
      47              :   uint8_t qs[128]; // 16 x u64
      48              : };
      49              : 
      50            3 : void __fallback_sscal(const unsigned int N, const float alpha, float *X,
      51              :                       const unsigned int incX) {
      52            3 :   assert(incX > 0);
      53           37 :   for (unsigned int i = 0; i < N; ++i)
      54           34 :     X[i * incX] = alpha * X[i * incX];
      55            3 : }
      56              : 
      57            3 : float __fallback_snrm2(const unsigned int N, const float *X,
      58              :                        const unsigned int incX) {
      59            3 :   assert(incX > 0);
      60              :   float sum = 0.0f;
      61              :   float tmp;
      62              : 
      63          110 :   for (unsigned int i = 0; i < N; i++) {
      64          107 :     tmp = X[i * incX];
      65          107 :     sum += tmp * tmp;
      66              :   }
      67            3 :   return sqrt(sum);
      68              : }
      69              : 
      70            2 : void __fallback_copy_s16_fp32(const unsigned int N, const int16_t *X,
      71              :                               float *Y) {
      72           22 :   for (unsigned int i = 0; i < N; ++i) {
      73           20 :     Y[i] = X[i];
      74              :   }
      75            2 : }
      76              : 
      77            1 : void __fallback_copy_u16_fp32(const unsigned int N, const uint16_t *X,
      78              :                               float *Y) {
      79            5 :   for (unsigned int i = 0; i < N; ++i) {
      80            4 :     Y[i] = X[i];
      81              :   }
      82            1 : }
      83              : 
      84            1 : void __fallback_copy_fp32_u32(const unsigned int N, const float *X,
      85              :                               uint32_t *Y) {
      86            5 :   for (unsigned int i = 0; i < N; ++i) {
      87            4 :     Y[i] = static_cast<uint32_t>(X[i]);
      88              :   }
      89            1 : }
      90              : 
      91            1 : void __fallback_copy_fp32_u16(const unsigned int N, const float *X,
      92              :                               uint16_t *Y) {
      93            5 :   for (unsigned int i = 0; i < N; ++i) {
      94            4 :     Y[i] = static_cast<uint16_t>(X[i]);
      95              :   }
      96            1 : }
      97              : 
      98            1 : void __fallback_copy_fp32_u8(const unsigned int N, const float *X, uint8_t *Y) {
      99            5 :   for (unsigned int i = 0; i < N; ++i) {
     100            4 :     Y[i] = static_cast<uint8_t>(X[i]);
     101              :   }
     102            1 : }
     103              : 
     104            1 : void __fallback_copy_fp32_s16(const unsigned int N, const float *X,
     105              :                               int16_t *Y) {
     106            5 :   for (unsigned int i = 0; i < N; ++i) {
     107            4 :     Y[i] = static_cast<int16_t>(X[i]);
     108              :   }
     109            1 : }
     110              : 
     111            3 : void __fallback_copy_fp32_s8(const unsigned int N, const float *X, int8_t *Y) {
     112         5281 :   for (unsigned int i = 0; i < N; ++i) {
     113         5278 :     Y[i] = static_cast<int8_t>(X[i]);
     114              :   }
     115            3 : }
     116              : 
     117            3 : void __fallback_copy_s16(const unsigned int N, const int16_t *X, int16_t *Y) {
     118          113 :   for (unsigned int i = 0; i < N; ++i) {
     119          110 :     Y[i] = X[i];
     120              :   }
     121            3 : }
     122              : 
     123            4 : void __fallback_copy_u16(const unsigned int N, const uint16_t *X, uint16_t *Y) {
     124          548 :   for (unsigned int i = 0; i < N; ++i) {
     125          544 :     Y[i] = X[i];
     126              :   }
     127            4 : }
     128              : 
     129            2 : void __fallback_scopy(const unsigned int N, const float *X,
     130              :                       const unsigned int incX, float *Y,
     131              :                       const unsigned int incY) {
     132            2 :   assert(incX > 0 && incY > 0);
     133           26 :   for (unsigned int i = 0; i < N; ++i)
     134           24 :     Y[i * incY] = X[i * incX];
     135            2 : }
     136              : 
     137            1 : void __fallback_scopy(const unsigned int N, const uint8_t *X,
     138              :                       const unsigned int incX, uint8_t *Y,
     139              :                       const unsigned int incY) {
     140           17 :   for (unsigned int idx = 0; idx < N; idx++) {
     141           16 :     Y[idx * incX] = X[idx * incY];
     142              :   }
     143            1 : }
     144              : 
     145           10 : void __fallback_scopy(const unsigned int N, const int8_t *X,
     146              :                       const unsigned int incX, int8_t *Y,
     147              :                       const unsigned int incY) {
     148          636 :   for (unsigned int idx = 0; idx < N; idx++) {
     149          626 :     Y[idx * incX] = X[idx * incY];
     150              :   }
     151           10 : }
     152              : 
     153            1 : void __fallback_scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
     154              :                                       const unsigned int incX, float *Y,
     155              :                                       const unsigned int incY) {
     156            5 :   for (unsigned int idx = 0; idx < N; idx++) {
     157            4 :     Y[2 * idx] = static_cast<float>(X[idx] >> 4);
     158            4 :     Y[2 * idx + 1] = static_cast<float>(X[idx] & 0x0f);
     159              :   }
     160            1 : }
     161              : 
     162              : /// @todo function with the same internal representation should be merged.
     163            2 : void __fallback_scopy_uint8_to_float32(const unsigned int N, const uint8_t *X,
     164              :                                        const unsigned int incX, float *Y,
     165              :                                        const unsigned int incY) {
     166           34 :   for (unsigned int idx = 0; idx < N; idx++) {
     167           32 :     Y[idx * incX] = X[idx * incY];
     168              :   }
     169            2 : }
     170              : 
     171            6 : void __fallback_scopy_int8_to_float32(const unsigned int N, const int8_t *X,
     172              :                                       const unsigned int incX, float *Y,
     173              :                                       const unsigned int incY) {
     174          212 :   for (unsigned int idx = 0; idx < N; idx++) {
     175          206 :     Y[idx * incX] = X[idx * incY];
     176              :   }
     177            6 : }
     178              : 
     179            3 : float __fallback_sdot(const unsigned int N, const float *X,
     180              :                       const unsigned int incX, const float *Y,
     181              :                       const unsigned int incY) {
     182              :   float ret = 0;
     183           14 :   for (unsigned int i = 0; i < N; ++i) {
     184           11 :     ret += X[i * incX] * Y[i * incY];
     185              :   }
     186            3 :   return ret;
     187              : }
     188              : 
     189            2 : void __fallback_saxpy(const unsigned int N, const float alpha, const float *X,
     190              :                       const unsigned int incX, float *Y,
     191              :                       const unsigned int incY) {
     192            2 :   assert(incX > 0 && incY > 0);
     193           10 :   for (unsigned int i = 0; i < N; ++i)
     194            8 :     Y[i * incY] = Y[i * incY] + X[i * incX] * alpha;
     195            2 : }
     196              : 
     197            4 : void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA,
     198              :                       bool TransB, const unsigned int M, const unsigned int N,
     199              :                       const unsigned int K, const float alpha, const float *A,
     200              :                       const unsigned int lda, const float *B,
     201              :                       const unsigned int ldb, const float beta, float *C,
     202              :                       const unsigned int ldc) {
     203           12 :   for (unsigned int m = 0; m < M; ++m) {
     204           24 :     for (unsigned int n = 0; n < N; ++n) {
     205              :       double c = 0.0;
     206           16 :       float c_old = C[m * ldc + n];
     207           48 :       for (unsigned int k = 0; k < K; ++k) {
     208              :         float a, b;
     209           32 :         a = ((TransA == true) ? A[k * lda + m] : A[m * lda + k]);
     210           32 :         b = ((TransB == true) ? B[n * ldb + k] : B[k * ldb + n]);
     211           32 :         c += a * b;
     212              :       }
     213           16 :       C[m * ldc + n] = alpha * c;
     214           16 :       if (beta != 0.0f) {
     215            4 :         C[m * ldc + n] += beta * c_old;
     216              :       }
     217              :     }
     218              :   }
     219            4 : }
     220              : 
     221            0 : void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA,
     222              :                       const unsigned int M, const unsigned int N,
     223              :                       const float alpha, const float *A, const unsigned int lda,
     224              :                       const float *X, const unsigned int incX, const float beta,
     225              :                       float *Y, const unsigned int incY) {
     226              : 
     227            0 :   if (TransA == true) {
     228            0 :     sgemv_loop(i, j, N, M);
     229              :   } else {
     230            0 :     sgemv_loop(j, i, M, N);
     231              :   }
     232            0 : }
     233              : 
     234            3 : unsigned int __fallback_isamax(const unsigned int N, const float *X,
     235              :                                const unsigned int incX) {
     236              :   unsigned int max_idx = 0;
     237            3 :   float max_val = X[0];
     238           12 :   for (unsigned int n = 1; n < N; n += incX) {
     239            9 :     float cur_val = std::abs(X[n]);
     240            9 :     if (cur_val > max_val) {
     241              :       max_val = cur_val;
     242              :       max_idx = n;
     243              :     }
     244              :   }
     245              : 
     246            3 :   return max_idx;
     247              : }
     248              : 
     249              : template <>
     250           12 : void __fallback_sine(const unsigned int N, float *X, float *Y, float alpha,
     251              :                      float beta) {
     252              :   unsigned int i = 0;
     253      2073635 :   while (i < N) {
     254      2073623 :     Y[i] = std::sin(alpha * X[i]) * beta;
     255      2073623 :     ++i;
     256              :   }
     257           12 : }
     258              : 
     259              : template <>
     260           14 : void __fallback_cosine(const unsigned int N, float *X, float *Y, float alpha,
     261              :                        float beta) {
     262              :   unsigned int i = 0;
     263      2073642 :   while (i < N) {
     264      2073628 :     Y[i] = std::cos(alpha * X[i]) * beta;
     265      2073628 :     ++i;
     266              :   }
     267           14 : }
     268              : 
     269            1 : void __fallback_inv_sqrt_inplace(const unsigned int N, float *X) {
     270            5 :   for (unsigned int i = 0; i < N; ++i) {
     271            4 :     X[i] = 1 / std::sqrt(static_cast<float>(X[i]));
     272              :   }
     273            1 : }
     274              : 
     275           65 : void __fallback_ele_mul(const unsigned int N, const float *X, const float *Y,
     276              :                         float *Z, float alpha, float beta,
     277              :                         unsigned int i_stride, unsigned int o_stride) {
     278       193609 :   for (unsigned int i = 0; i < N; ++i) {
     279       193544 :     *Z = *X * alpha * *Y + ((0.0f == beta) ? 0.0f : beta * *Z);
     280       193544 :     X += o_stride;
     281       193544 :     Y += i_stride;
     282       193544 :     Z += o_stride;
     283              :   }
     284           65 : }
     285              : 
     286           64 : void __fallback_ele_add(const unsigned int N, const float *X, const float *Y,
     287              :                         float *Z, float alpha, float beta,
     288              :                         unsigned int i_stride, unsigned int o_stride) {
     289       193604 :   for (unsigned int i = 0; i < N; ++i) {
     290       193540 :     *Z = *X + alpha * *Y + ((0.0f == beta) ? 0.0f : beta * *Z);
     291       193540 :     X += o_stride;
     292       193540 :     Y += i_stride;
     293       193540 :     Z += o_stride;
     294              :   }
     295           64 : }
     296              : 
     297            1 : void __fallback_ele_sub(const unsigned N, const float *X, const float *Y,
     298              :                         float *Z, float alpha, float beta,
     299              :                         unsigned int i_stride, unsigned int o_stride) {
     300            5 :   for (unsigned int i = 0; i < N; ++i) {
     301            4 :     *Z = *X - alpha * *Y + ((0.0f == beta) ? 0.0f : beta * *Z);
     302            4 :     X += o_stride;
     303            4 :     Y += i_stride;
     304            4 :     Z += o_stride;
     305              :   }
     306            1 : }
     307              : 
     308          334 : void __fallback_ele_div(const unsigned N, const float *X, const float *Y,
     309              :                         float *Z, float alpha, float beta,
     310              :                         unsigned int i_stride, unsigned int o_stride) {
     311         3667 :   for (unsigned int i = 0; i < N; ++i) {
     312         3333 :     *Z = *X / (alpha * *Y) + ((0.0f == beta) ? 0.0f : beta * *Z);
     313         3333 :     X += o_stride;
     314         3333 :     Y += i_stride;
     315         3333 :     Z += o_stride;
     316              :   }
     317          334 : }
     318              : 
     319            2 : void __fallback_transpose_matrix(const unsigned int M, const unsigned int N,
     320              :                                  const float *src, unsigned int ld_src,
     321              :                                  float *dst, unsigned int ld_dst) {
     322            7 :   for (unsigned int i = 0; i < M; i++) {
     323           20 :     for (unsigned int j = 0; j < N; j++) {
     324           15 :       dst[i + j * ld_dst] = src[i * ld_src + j];
     325              :     }
     326              :   }
     327            2 : }
     328              : 
     329            3 : bool __fallback_isValid(const unsigned int N, const float *X) {
     330            9 :   for (size_t i = 0; i < N; ++i) {
     331            8 :     if (!isFloatValid(*X)) {
     332              :       return false;
     333              :     }
     334            6 :     ++X;
     335              :   }
     336              : 
     337              :   return true;
     338              : }
     339              : 
     340            0 : void __fallback_unpack_q4_0x8_transpose16(const void *src,
     341              :                                           uint16_t *__restrict dT,
     342              :                                           uint16_t *__restrict qsT, int N,
     343              :                                           int K, int CT) {
     344              :   const auto *x = static_cast<const block_q4_0x8 *>(src);
     345              : 
     346            0 :   const int groups_N8 = N / 8;    // # of 8-row groups
     347            0 :   const int cols_scales = K / 32; // # subblocks along K (scales columns)
     348              :   const uint64_t mask = 0x8888888888888888ULL; // flip MSB of each nibble
     349              : 
     350              :   // Tile over columns to keep working set small.
     351            0 :   for (int c0 = 0; c0 < cols_scales; c0 += CT) {
     352            0 :     const int c1 = std::min(c0 + CT, cols_scales);
     353              : 
     354              :     // Process rows in natural 8-row groups for source-friendly access
     355            0 :     for (int b = 0; b < groups_N8; ++b) {
     356              :       // For each column in the tile, read the source block contiguously
     357            0 :       for (int c = c0; c < c1; ++c) {
     358            0 :         const block_q4_0x8 &blk = x[b * cols_scales + c];
     359              : 
     360              :         // Precompute column bases in the transposed outputs
     361            0 :         unsigned short *__restrict dT_c = dT + c * N; // column c in dT
     362              :         unsigned short *__restrict qsT_c0 =
     363            0 :           qsT + (c * 8) * N; // first of 8 columns for this subblock
     364              : 
     365              :         // Walk the 8 rows inside this block group
     366            0 :         for (int off = 0; off < 8; ++off) {
     367            0 :           const int r = b * 8 + off; // absolute row index in [0..N-1]
     368              : 
     369              :           // ---------- SCALES (fp16), transposed on the fly ----------
     370            0 :           dT_c[r] = blk.d[off];
     371              : 
     372              :           // ---------- QUANTS (bytes → XOR → swizzle → 8×u16), transposed
     373              :           // ---------- load two u64 chunks for this row
     374              :           uint64_t v0, v1;
     375            0 :           std::memcpy(&v0, blk.qs + 8 * off, 8);
     376            0 :           std::memcpy(&v1, blk.qs + 8 * (off + 8), 8);
     377            0 :           v0 ^= mask;
     378            0 :           v1 ^= mask;
     379              : 
     380              :           unsigned char in[16];
     381              :           std::memcpy(in + 0, &v0, 8);
     382              :           std::memcpy(in + 8, &v1, 8);
     383              : 
     384              :           // nibble-lane swizzle (identical to your reference)
     385              :           unsigned char out[16];
     386            0 :           for (int i = 0; i < 8; ++i) {
     387            0 :             const unsigned char x0 = in[2 * i + 0];
     388            0 :             const unsigned char x1 = in[2 * i + 1];
     389            0 :             out[i + 0] = (unsigned char)((x0 & 0x0F) | ((x1 & 0x0F) << 4));
     390            0 :             out[i + 8] = (unsigned char)(((x0 & 0xF0) >> 4) | (x1 & 0xF0));
     391              :           }
     392              : 
     393              :           // pack to 8×u16 and store to transposed columns j = c*8 .. c*8+7 at
     394              :           // row r
     395            0 :           for (int t = 0; t < 8; ++t) {
     396            0 :             const unsigned short w =
     397            0 :               (unsigned short)((unsigned short)out[2 * t + 0] |
     398            0 :                                ((unsigned short)out[2 * t + 1] << 8));
     399            0 :             qsT_c0[t * N + r] = w; // column (c*8 + t), row r
     400              :           }
     401              :         } // off
     402              :       }   // c in tile
     403              :     }     // b
     404              :   }       // c0 tiles
     405            0 : }
     406              : 
     407              : template <>
     408            0 : void __fallback_calc_trigonometric_vals_dup(unsigned int N_half, float *angle,
     409              :                                             float *cos_, float *sin_,
     410              :                                             unsigned int from,
     411              :                                             float attention_scaling) {
     412              :   throw std::runtime_error(
     413              :     "Error: No implementation of rotary embedding layer incremental_forwarding "
     414            0 :     "with SIMD acceleration except for NEON!");
     415              : }
     416              : 
     417            1 : void __fallback_swiglu(const unsigned int N, float *X, float *Y, float *Z) {
     418              :   unsigned int i = 0;
     419            5 :   while (i < N) {
     420            4 :     X[i] = (Y[i] / (1.f + std::exp(-Y[i]))) * Z[i];
     421            4 :     ++i;
     422              :   }
     423            1 : }
     424              : 
     425            1 : void __fallback_swiglu(const unsigned int N, float *X, float *Y, float *Z,
     426              :                        float alpha) {
     427              :   unsigned int i = 0;
     428            5 :   while (i < N) {
     429            4 :     X[i] = (Y[i] / (1.f + std::exp(-alpha * Y[i]))) * Z[i];
     430            4 :     ++i;
     431              :   }
     432            1 : }
     433              : 
     434            4 : float __fallback_max(const unsigned int N, float *X) {
     435            4 :   std::vector<float> v(X, X + N);
     436            8 :   return *std::max_element(v.begin(), v.end());
     437            4 : }
     438              : 
     439            2 : void __fallback_softmax(const unsigned int N, float *X, float *Y) {
     440              :   unsigned int i = 0;
     441              :   float sum = 0.f;
     442            2 :   float max_x = __fallback_max(N, X);
     443           10 :   while (i < N) {
     444            8 :     sum += std::exp(X[i] - max_x);
     445            8 :     ++i;
     446              :   }
     447              :   i = 0;
     448           10 :   while (i < N) {
     449            8 :     Y[i] = std::exp(X[i] - max_x) / sum;
     450            8 :     ++i;
     451              :   }
     452            2 : }
     453              : 
     454              : template <>
     455            0 : void __fallback_gemm_q4_0(const unsigned int M, const unsigned int N,
     456              :                           const unsigned int K, const float *A,
     457              :                           const unsigned int lda, const void *B,
     458              :                           const unsigned int ldb, float *C,
     459              :                           const unsigned int ldc) {
     460            0 :   throw std::runtime_error("NYI : __fallback_gemm_q4_0");
     461              : }
     462              : 
     463            0 : void __fallback_gemm_q4_K(const unsigned int M, const unsigned int N,
     464              :                           const unsigned int K, const float *A,
     465              :                           const unsigned int lda, const void *B,
     466              :                           const unsigned int ldb, float *C,
     467              :                           const unsigned int ldc) {
     468            0 :   throw std::runtime_error("NYI : __fallback_gemm_q4_K");
     469              : }
     470              : 
     471            0 : float __fallback_dot_q6_K_q8_K(const unsigned int K, const void *v_q6_K,
     472              :                                const void *v_q8_K) {
     473            0 :   throw std::runtime_error("NYI : __fallback_dot_q6_K_q8_K");
     474              :   return 0;
     475              : }
     476              : 
     477            0 : float __fallback_dot_q6_K_f32(const unsigned int K, const void *v_q6_K,
     478              :                               const float *f) {
     479            0 :   throw std::runtime_error("NYI : __fallback_dot_q6_K_f32");
     480              :   return 0;
     481              : }
     482              : 
     483              : template <>
     484            0 : void __fallback_gemm_q6_K(const unsigned int M, const unsigned int N,
     485              :                           const unsigned int K, const float *A,
     486              :                           const unsigned int lda, const void *B,
     487              :                           const unsigned int ldb, float *C,
     488              :                           const unsigned int ldc) {
     489            0 :   throw std::runtime_error("NYI : __fallback_gemm_q6_K");
     490              : }
     491              : 
     492            0 : size_t __fallback_quantize_q4_0(const float *src, void *dst, int64_t nrow,
     493              :                                 int64_t n_per_row, const float *quant_weights) {
     494            0 :   throw std::runtime_error("NYI : __fallback_quantize_q4_0");
     495              :   return 1;
     496              : }
     497              : 
     498            0 : size_t __fallback_quantize_q4_K(const float *src, void *dst, int64_t nrow,
     499              :                                 int64_t n_per_row, const float *quant_weights) {
     500            0 :   throw std::runtime_error("NYI : __fallback_quantize_q4_K");
     501              :   return 1;
     502              : }
     503              : 
     504            0 : size_t __fallback_quantize_q6_K(const float *src, void *dst, int64_t nrow,
     505              :                                 int64_t n_per_row, const float *quant_weights) {
     506            0 :   throw std::runtime_error("NYI : __fallback_quantize_q4_K");
     507              :   return 1;
     508              : }
     509              : 
     510            0 : void __fallback_dequantize_row_q4_K(const void *x_raw, float *y, int64_t k) {
     511            0 :   throw std::runtime_error("NYI : __fallback_dequantize_row_q4_K");
     512              : }
     513              : 
     514            0 : void __fallback_dequantize_row_q4_0(const void *x_raw, float *y, int64_t k) {
     515            0 :   throw std::runtime_error("NYI : __fallback_dequantize_row_q4_0");
     516              : }
     517              : 
     518            0 : void __fallback_dequantize_row_q6_K(const void *x, float *y, int64_t k) {
     519            0 :   throw std::runtime_error("NYI : __fallback_dequantize_row_q6_K");
     520              : }
     521              : 
     522            0 : void __fallback_quantize_row_q6_K(const float *src, void *dst, int64_t k) {
     523            0 :   throw std::runtime_error("NYI : __fallback_quantize_row_q6_K");
     524              : }
     525              : 
     526              : template <>
     527            0 : void __fallback_quantize_row_q8_K(const float *src, void *dst, int64_t k) {
     528            0 :   throw std::runtime_error("NYI : __fallback_quantize_row_q8_K");
     529              : }
     530              : 
     531              : template <>
     532            0 : void __fallback_dequantize_row_q8_K(const void *x, float *y, int64_t k) {
     533            0 :   throw std::runtime_error("NYI : __fallback_dequantize_row_q8_K");
     534              : }
     535              : 
     536            0 : void __fallback_repack_q4_0_to_q4_0_4(void *W, void *repacked_W,
     537              :                                       size_t data_size, const unsigned int M,
     538              :                                       const unsigned int N) {
     539            0 :   throw std::runtime_error("NYI : __fallback_repack_q4_0_to_q4_0_4");
     540              : }
     541              : 
     542            0 : void __fallback_repack_q4_0_to_q4_0_8(void *W, void *repacked_W,
     543              :                                       size_t data_size, const unsigned int M,
     544              :                                       const unsigned int N) {
     545            0 :   throw std::runtime_error("NYI : __fallback_repack_q4_0_to_q4_0_8");
     546              : }
     547              : 
     548            0 : void __fallback_repack_q4_K_to_q4_K_8(void *W, void *repacked_W,
     549              :                                       size_t data_size, const unsigned int M,
     550              :                                       const unsigned int N) {
     551            0 :   throw std::runtime_error("NYI : __fallback_repack_q4_K_to_q4_K_8");
     552              : }
     553              : 
     554            0 : void __fallback_unpack_q4_0_8_to_q4_0(const void *in_q4_0x, void *out_q4_0,
     555              :                                       size_t data_size, const unsigned int M,
     556              :                                       const unsigned int N) {
     557            0 :   throw std::runtime_error("NYI : __fallback_unpack_q4_0_8_to_q4_0");
     558              : }
     559              : 
     560            0 : void __fallback_softmax_row_inplace(float *qk_out, size_t start_row,
     561              :                                     size_t end_row, size_t num_heads) {
     562            0 :   throw std::runtime_error("NYI : __fallback_softmax_row_inplace");
     563              : }
     564              : 
     565            0 : void __fallback_softmax_row(float *qk_out, size_t start_row, size_t end_row,
     566              :                             size_t num_heads) {
     567            0 :   throw std::runtime_error("NYI : __fallback_softmax_row");
     568              : }
     569              : 
     570            0 : void __fallback_compute_fp16vcache_fp32_transposed(
     571              :   int row_num, const float *in, const uint16_t *vcache, float *output,
     572              :   int num_cache_head, int gqa_size, int head_dim, size_t local_window_size) {
     573              :   throw std::runtime_error(
     574            0 :     "NYI : __fallback_compute_fp16vcache_fp32_transposed");
     575              : }
     576              : 
     577              : template <>
     578            0 : void __fallback_compute_kcaches(const float *in, const uint16_t *kcache,
     579              :                                 float *output, int num_rows, int num_cache_head,
     580              :                                 int head_dim, int gqa_size, int tile_size,
     581              :                                 size_t local_window_size) {
     582            0 :   throw std::runtime_error("NYI : __fallback_compute_kcaches");
     583              : }
     584              : 
     585            0 : void __fallback_compute_rotary_emb_value(unsigned int width, unsigned int dim,
     586              :                                          unsigned int half_, float *inout,
     587              :                                          void *output, const float *cos_,
     588              :                                          const float *sin_,
     589              :                                          bool only_convert_to_fp16) {
     590            0 :   throw std::runtime_error("NYI : __fallback_compute_rotary_emb_value");
     591              : }
     592              : 
     593            0 : void __fallback_rms_norm_wrt_width_fp32_intrinsic(const float *__restrict X,
     594              :                                                   float *__restrict Y, size_t H,
     595              :                                                   size_t W, float epsilon) {
     596              :   throw std::runtime_error(
     597            0 :     "NYI : __fallback_rms_norm_wrt_width_fp32_intrinsic");
     598              : }
     599              : 
     600              : template <>
     601            0 : void __fallback_rms_norm_wrt_width_fp16_intrinsic(const float *__restrict X,
     602              :                                                   float *__restrict Y, size_t H,
     603              :                                                   size_t W, float epsilon) {
     604              :   throw std::runtime_error(
     605            0 :     "NYI : __fallback_rms_norm_wrt_width_fp16_intrinsic");
     606              : }
     607              : 
     608              : template <>
     609           22 : void __fallback_clamp(const float *input, float *output, size_t length,
     610              :                       float lower_bound, float upper_bound) {
     611        64540 :   for (int i = 0; i < length; ++i) {
     612       129036 :     output[i] = std::clamp(input[i], lower_bound, upper_bound);
     613              :   }
     614           22 : }
     615              : 
     616            0 : void __fallback_create_q4_0_weights(const uint8_t *int4_weight,
     617              :                                     uint8_t *q4_0_weight) {
     618            0 :   for (int i = 0; i < 8; i++) {
     619            0 :     char v0 = int4_weight[i] & 0xF;
     620            0 :     char v1 = (int4_weight[i] >> 4) & 0xF;
     621            0 :     char v2 = int4_weight[8 + i] & 0xF;
     622              :     char v3 = (int4_weight[8 + i] >> 4) & 0xF;
     623            0 :     q4_0_weight[2 * i] = (v0 | (v2 << 4));
     624            0 :     q4_0_weight[2 * i + 1] = (v1 | (v3 << 4));
     625              :   }
     626            0 : }
     627              : 
     628            0 : void __fallback_transform_q4_0x_from_int4(size_t N, size_t K,
     629              :                                           const uint8_t *osv32_weights,
     630              :                                           const uint16_t *osv32_scales,
     631              :                                           size_t scale_group_size,
     632              :                                           void *dst_q4_0x) {
     633            0 :   Q4_0Utils::transformQ4_0x_FromInt4(N, K, osv32_weights, osv32_scales,
     634              :                                      scale_group_size, 8, dst_q4_0x);
     635            0 : }
     636              : 
     637              : } // namespace nntrainer
        

Generated by: LCOV version 2.0-1