LCOV - code coverage report
Current view: top level - nntrainer/tensor/cpu_backend/x86 - x86_compute_backend.cpp (source / functions)
Test: coverage_filtered.info
Test Date: 2026-01-12 20:43:37
Coverage: Lines: 64.4 % (130 of 202) | Functions: 64.4 % (47 of 73)

            Line data    Source code
       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
       4              :  *
       5              :  * @file   x86_compute_backend.cpp
       6              :  * @date   23 April 2024
       7              :  * @see    https://github.com/nntrainer/nntrainer
       8              :  * @author Sungsik Kong <ss.kong@samsung.com>
       9              :  * @bug    No known bugs except for NYI items
      10              :  * @brief  Compute backend for x86
      11              :  *
      12              :  */
      13              : 
      14              : #include <assert.h>
      15              : 
      16              : #include <avx2_impl.h>
      17              : #ifdef USE_BLAS
      18              : #include <cblas_interface.h>
      19              : #endif
      20              : #include <fallback_internal.h>
      21              : #include <ggml_interface.h>
      22              : #include <nntrainer_error.h>
      23              : #include <q4_0_utils.h>
      24              : #include <x86_compute_backend.h>
      25              : 
      26              : #define ROW_MAJOR 0
      27              : #define COL_MAJOR 1
      28              : 
      29              : namespace nntrainer {
      30              : 
      31           61 : void init_backend() { __ggml_init();
      32              :   // Do not repeatedly call set_num_threads. It's a global config.
      33           61 :   __openblas_set_num_threads(-1); // -1 = BLAS_NUM_THREADS if defined.
      34           61 : }
      35              : 
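The backend is initialized once per process; a minimal usage sketch (the names come from this file, the surrounding main() is hypothetical):

    #include <x86_compute_backend.h>

    int main() {
      // init_backend() runs __ggml_init() and sets the OpenBLAS thread count
      // once; per the note above, the thread count is a global config and
      // must not be set repeatedly.
      nntrainer::init_backend();
      return 0;
    }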
      36            0 : void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
      37              :                            const unsigned int incX, float *Y,
      38              :                            const unsigned int incY) {
      39            0 :   __fallback_scopy_int4_to_float32(N, X, incX, Y, incY);
      40            0 : }
      41              : 
      42            2 : void copy_s16(const unsigned int N, const int16_t *X, int16_t *Y) {
      43            2 :   __fallback_copy_s16(N, X, Y);
      44            2 : }
      45              : 
      46            3 : void copy_u16(const unsigned int N, const uint16_t *X, uint16_t *Y) {
      47            3 :   __fallback_copy_u16(N, X, Y);
      48            3 : }
      49              : 
      50            1 : void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
      51            1 :   __fallback_copy_s16_fp32(N, X, Y);
      52            1 : }
      53              : 
      54            1 : void copy_u16_fp32(const unsigned int N, const uint16_t *X, float *Y) {
      55            1 :   nntrainer::avx2::copy_f16_f32(N, X, Y);
      56            1 : }
      57              : 
      58            0 : void copy_fp32_u32(const unsigned int N, const float *X, uint32_t *Y) {
      59            0 :   __fallback_copy_fp32_u32(N, X, Y);
      60            0 : }
      61              : 
      62            0 : void copy_fp32_u16(const unsigned int N, const float *X, uint16_t *Y) {
      63            0 :   nntrainer::avx2::copy_f32_f16(N, X, Y);
      64            0 : }
      65              : 
      66            0 : void copy_fp32_u8(const unsigned int N, const float *X, uint8_t *Y) {
      67            0 :   __fallback_copy_fp32_u8(N, X, Y);
      68            0 : }
      69              : 
      70            0 : void copy_fp32_s16(const unsigned int N, const float *X, int16_t *Y) {
      71            0 :   __fallback_copy_fp32_s16(N, X, Y);
      72            0 : }
      73              : 
      74            2 : void copy_fp32_s8(const unsigned int N, const float *X, int8_t *Y) {
      75            2 :   __fallback_copy_fp32_s8(N, X, Y);
      76            2 : }
      77              : 
      78              : /**
      79              :  * @brief     copy function : Y = X
      80              :  * @param[in] N number of elements in X
      81              :  * @param[in] X float * for Vector X
      82              :  * @param[in] Y uint32_t * for Vector Y
      83              :  */
      84            0 : template <> void copy_fp32(const unsigned int N, const float *X, uint32_t *Y) {
      85            0 :   copy_fp32_u32(N, X, Y);
      86            0 : }
      87              : 
      88              : /**
      89              :  * @brief     copy function : Y = X
      90              :  * @param[in] N number of elements in X
      91              :  * @param[in] X float * for Vector X
      92              :  * @param[in] Y uint16_t * for Vector Y
      93              :  */
      94            0 : template <> void copy_fp32(const unsigned int N, const float *X, uint16_t *Y) {
      95            0 :   copy_fp32_u16(N, X, Y);
      96            0 : }
      97              : 
      98              : /**
      99              :  * @brief     copy function : Y = X
     100              :  * @param[in] N number of elements in X
     101              :  * @param[in] X float * for Vector X
      102              :  * @param[in] Y uint8_t * for Vector Y
     103              :  */
     104            0 : template <> void copy_fp32(const unsigned int N, const float *X, uint8_t *Y) {
     105            0 :   copy_fp32_u8(N, X, Y);
     106            0 : }
     107              : 
     108              : /**
     109              :  * @brief     copy function : Y = X
     110              :  * @param[in] N number of elements in X
     111              :  * @param[in] X float * for Vector X
     112              :  * @param[in] Y int16_t * for Vector Y
     113              :  */
     114            0 : template <> void copy_fp32(const unsigned int N, const float *X, int16_t *Y) {
     115            0 :   copy_fp32_s16(N, X, Y);
     116            0 : }
     117              : 
     118              : /**
     119              :  * @brief     copy function : Y = X
     120              :  * @param[in] N number of elements in X
     121              :  * @param[in] X float * for Vector X
     122              :  * @param[in] Y int8_t * for Vector Y
     123              :  */
     124            2 : template <> void copy_fp32(const unsigned int N, const float *X, int8_t *Y) {
     125            2 :   copy_fp32_s8(N, X, Y);
     126            2 : }
     127              : 
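The copy_fp32 specializations above expose one templated entry point for float-to-integer conversion copies; template argument deduction on the destination pointer selects the matching copy_fp32_* routine. A minimal caller sketch, assuming the primary template is declared in x86_compute_backend.h:

    #include <cstdint>
    #include <vector>
    #include <x86_compute_backend.h>

    void copy_demo() {
      std::vector<float> src = {1.0f, 2.0f, 3.0f};
      std::vector<int8_t> dst(src.size());
      // Deduces the int8_t specialization, which forwards to copy_fp32_s8.
      nntrainer::copy_fp32(static_cast<unsigned int>(src.size()), src.data(),
                           dst.data());
    }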
     128            2 : void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
     129              :                            const unsigned int incX, float *Y,
     130              :                            const unsigned int incY) {
     131            2 :   __fallback_scopy_uint8_to_float32(N, X, incX, Y, incY);
     132            2 : }
     133              : 
     134            5 : void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
     135              :                            const unsigned int incX, float *Y,
     136              :                            const unsigned int incY) {
     137            5 :   __fallback_scopy_int8_to_float32(N, X, incX, Y, incY);
     138            5 : }
     139              : 
     140              : template <>
     141           10 : void sine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
     142           10 :   __fallback_sine(N, X, Y, alpha, beta);
     143           10 : }
     144              : 
     145              : template <>
     146           13 : void cosine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
     147           13 :   __fallback_cosine(N, X, Y, alpha, beta);
     148           13 : }
     149              : 
     150            0 : void inv_sqrt_inplace(const unsigned int N, float *X) {
     151            0 :   __fallback_inv_sqrt_inplace(N, X);
     152            0 : }
     153              : 
     154        29926 : void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
     155              :              float alpha, float beta, unsigned int i_stride,
     156              :              unsigned int o_stride) {
     157        29926 :   nntrainer::avx2::ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride);
     158        29926 : }
     159              : 
     160       113033 : void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
     161              :              float alpha, float beta, unsigned int i_stride,
     162              :              unsigned int o_stride) {
     163       113033 :   nntrainer::avx2::ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride);
     164       113033 : }
     165              : 
      166            0 : void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z,
     167              :              float alpha, float beta, unsigned int i_stride,
     168              :              unsigned int o_stride) {
     169            0 :   __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride);
     170            0 : }
     171              : 
      172          333 : void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
     173              :              float alpha, float beta, unsigned int i_stride,
     174              :              unsigned int o_stride) {
     175          333 :   __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride);
     176          333 : }
     177              : 
     178        15254 : void saxpy(const unsigned int N, const float alpha, const float *X,
     179              :            const unsigned int incX, float *Y, const unsigned int incY) {
     180              : #ifdef USE_BLAS
     181        15254 :   __cblas_saxpy(N, alpha, X, incX, Y, incY);
     182              : #else
     183              :   __fallback_saxpy(N, alpha, X, incX, Y, incY);
     184              : #endif
     185        15254 : }
     186              : 
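Each BLAS routine in this file follows the same dispatch pattern: the cblas implementation when USE_BLAS is defined, a scalar fallback otherwise. For reference, the semantics the saxpy fallback must honor are Y[i*incY] += alpha * X[i*incX]; a sketch of those semantics, not the actual __fallback_saxpy body:

    static void saxpy_reference(unsigned int N, float alpha, const float *X,
                                unsigned int incX, float *Y, unsigned int incY) {
      // Y = alpha * X + Y, with arbitrary positive strides.
      for (unsigned int i = 0; i < N; ++i)
        Y[i * incY] += alpha * X[i * incX];
    }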
     187        88188 : void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
     188              :            const unsigned int N, const float alpha, const float *A,
     189              :            const unsigned int lda, const float *X, const unsigned int incX,
     190              :            const float beta, float *Y, const unsigned int incY) {
     191              : #ifdef USE_BLAS
     192        88188 :   __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
     193              :                 incY);
     194              : #else
     195              :   __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
     196              :                    incY);
     197              : #endif
     198        88188 : }
     199              : 
     200          245 : float sdot(const unsigned int N, const float *X, const unsigned int incX,
     201              :            const float *Y, const unsigned int incY) {
     202              : #ifdef USE_BLAS
     203          245 :   return __cblas_sdot(N, X, incX, Y, incY);
     204              : #else
     205              :   return __fallback_sdot(N, X, incX, Y, incY);
     206              : #endif
     207              : }
     208              : 
     209            0 : void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX,
     210              :            uint8_t *Y, const unsigned int incY) {
     211            0 :   __fallback_scopy(N, X, incX, Y, incY);
     212            0 : }
     213              : 
     214            9 : void scopy(const unsigned int N, const int8_t *X, const unsigned int incX,
     215              :            int8_t *Y, const unsigned int incY) {
     216            9 :   __fallback_scopy(N, X, incX, Y, incY);
     217            9 : }
     218              : 
     219       173980 : void scopy(const unsigned int N, const float *X, const unsigned int incX,
     220              :            float *Y, const unsigned int incY) {
      221              :   /// @note cblas_scopy causes SIGSEGV for some reason. Use the custom
      222              :   /// implementation instead.
     223              :   // __cblas_scopy(N, X, incX, Y, incY);
     224       173980 :   nntrainer::avx2::custom_scopy(N, X, incX, Y, incY);
     225       173980 : }
     226              : 
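Whatever the root cause of the cblas_scopy crash noted above, the contract the AVX2 replacement must satisfy is the plain strided copy below (a reference sketch with a contiguous fast path, not the vectorized body):

    #include <cstring>

    static void scopy_reference(unsigned int N, const float *X, unsigned int incX,
                                float *Y, unsigned int incY) {
      if (incX == 1 && incY == 1) {        // contiguous: memcpy suffices
        std::memcpy(Y, X, sizeof(float) * N);
        return;
      }
      for (unsigned int i = 0; i < N; ++i) // strided: element-wise copy
        Y[i * incY] = X[i * incX];
    }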
     227         2613 : void sscal(const unsigned int N, const float alpha, float *X,
     228              :            const unsigned int incX) {
     229              : #ifdef USE_BLAS
     230         2613 :   __cblas_sscal(N, alpha, X, incX);
     231              : #else
     232              :   __fallback_sscal(N, alpha, X, incX);
     233              : #endif
     234         2613 : }
     235              : 
     236         1935 : float snrm2(const unsigned int N, const float *X, const unsigned int incX) {
     237              : #ifdef USE_BLAS
     238         1935 :   return __cblas_snrm2(N, X, incX);
     239              : #else
     240              :   return __fallback_snrm2(N, X, incX);
     241              : #endif
     242              : }
     243              : 
     244        16977 : void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
     245              :            const unsigned int M, const unsigned int N, const unsigned int K,
     246              :            const float alpha, const float *A, const unsigned int lda,
     247              :            const float *B, const unsigned int ldb, const float beta, float *C,
     248              :            const unsigned int ldc) {
     249              : #ifdef USE_BLAS
     250        16977 :   __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
     251              :                 beta, C, ldc);
     252              : #else
     253              :   __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B,
     254              :                    ldb, beta, C, ldc);
     255              : #endif
     256        16977 : }
     257              : 
     258            3 : unsigned int isamax(const unsigned int N, const float *X,
     259              :                     const unsigned int incX) {
     260              : #ifdef USE_BLAS
     261            3 :   return __cblas_isamax(N, X, incX);
     262              : #else
     263              :   return __fallback_isamax(N, X, incX);
     264              : #endif
     265              : }
     266           25 : void transpose_matrix(const unsigned int M, const unsigned int N,
     267              :                       const float *src, unsigned int ld_src, float *dst,
     268              :                       unsigned int ld_dst) {
     269           25 :   nntrainer::avx2::transpose_matrix(M, N, src, ld_src, dst, ld_dst);
     270           25 : }
     271              : 
     272           12 : bool is_valid(const unsigned int N, const float *input) {
     273           12 :   return nntrainer::avx2::is_valid(N, input);
     274              : }
     275              : 
     276            0 : void unpack_q4_0x8_transpose16(const void *src, uint16_t *d_out,
     277              :                                uint16_t *qs_out, int N, int K) {
     278            0 :   return nntrainer::avx2::unpack_q4_0x8_transpose16(src, d_out, qs_out, N, K);
     279              : }
     280              : 
     281              : template <>
     282            0 : void calc_trigonometric_vals_dup(unsigned int N_half, float *angle, float *cos_,
     283              :                                  float *sin_, unsigned int from,
     284              :                                  float attention_scaling) {
     285            0 :   __fallback_calc_trigonometric_vals_dup(N_half, angle, cos_, sin_, from,
     286              :                                          attention_scaling);
     287            0 : }
     288              : 
     289            0 : void swiglu(const unsigned int N, float *X, float *Y, float *Z) {
     290            0 :   nntrainer::avx2::swiglu(N, X, Y, Z);
     291            0 : }
     292              : 
     293            0 : void swiglu(const unsigned int N, float *X, float *Y, float *Z, float alpha) {
     294            0 :   nntrainer::avx2::swiglu(N, X, Y, Z, alpha);
     295            0 : }
     296              : 
     297            0 : float max_val(const unsigned int N, float *X) { return __fallback_max(N, X); }
     298              : 
     299            0 : void softmax(const unsigned int N, float *X, float *Y) {
     300            0 :   __fallback_softmax(N, X, Y);
     301            0 : }
     302              : 
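max_val and softmax fall back to scalar code; the conventional numerically stable formulation subtracts the running maximum before exponentiating, as in this sketch (assumed semantics, not the __fallback_softmax body):

    #include <algorithm>
    #include <cmath>

    static void softmax_reference(unsigned int N, const float *X, float *Y) {
      if (N == 0)
        return;
      float max_v = X[0];
      for (unsigned int i = 1; i < N; ++i)
        max_v = std::max(max_v, X[i]);   // shift by max for numerical stability
      float sum = 0.0f;
      for (unsigned int i = 0; i < N; ++i) {
        Y[i] = std::exp(X[i] - max_v);
        sum += Y[i];
      }
      for (unsigned int i = 0; i < N; ++i)
        Y[i] /= sum;
    }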
     303              : template <>
     304           55 : void gemm_q4_0(const unsigned int M, const unsigned int N, const unsigned int K,
     305              :                const float *A, const unsigned int lda, const void *B,
     306              :                const unsigned int ldb, float *C, const unsigned int ldc) {
     307           55 :   return __ggml_q4_0_8x8_q8_0_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
     308              : }
     309              : 
     310            0 : void gemm_q4_0(const unsigned int M, std::vector<unsigned int> Ns,
     311              :                const unsigned int K, const float *A, const unsigned int lda,
     312              :                std::vector<void *> Bs, std::vector<unsigned int> ldbs,
     313              :                std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
     314            0 :   throw std::runtime_error("Error: NYI for gemm_q4_0 with vectored weights");
     315              : }
     316              : 
     317            9 : void gemm_q4_K(const unsigned int M, const unsigned int N, const unsigned int K,
     318              :                const float *A, const unsigned int lda, const void *B,
     319              :                const unsigned int ldb, float *C, const unsigned int ldc) {
     320            9 :   return __ggml_q4_K_8x8_q8_K_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
     321              : }
     322              : 
     323            0 : void gemm_q4_K(const unsigned int M, std::vector<unsigned int> Ns,
     324              :                const unsigned int K, const float *A, const unsigned int lda,
     325              :                std::vector<void *> Bs, std::vector<unsigned int> ldbs,
     326              :                std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
     327            0 :   return __ggml_q4_K_8x8_q8_K_GEMM(M, Ns, K, A, lda, Bs, ldbs, Cs, ldcs);
     328              : }
     329              : 
     330           63 : float dot_q6_K_q8_K(const unsigned int K, const void *v_q6_K,
     331              :                     const void *v_q8_K) {
     332           63 :   return __ggml_vec_dot_q6_K_q8_K(K, v_q6_K, v_q8_K);
     333              : }
     334              : 
     335            0 : float dot_q6_K_f32(const unsigned int K, const void *v_q6_K, const float *f) {
     336            0 :   return __ggml_vec_dot_q6_K_f32(K, v_q6_K, f);
     337              : }
     338              : 
     339              : template <>
     340            7 : void gemm_q6_K(const unsigned int M, const unsigned int N, const unsigned int K,
     341              :                const float *A, const unsigned int lda, const void *B,
     342              :                const unsigned int ldb, float *C, const unsigned int ldc) {
     343            7 :   return __ggml_gemm_q6_K(M, N, K, A, lda, B, ldb, C, ldc);
     344              : }
     345              : 
     346           31 : size_t quantize_q4_0(const float *src, void *dst, int64_t nrow,
     347              :                      int64_t n_per_row, const float *quant_weights) {
     348           31 :   return __ggml_quantize_q4_0(src, dst, nrow, n_per_row, quant_weights);
     349              : }
     350              : 
     351            8 : size_t quantize_q4_K(const float *src, void *dst, int64_t nrow,
     352              :                      int64_t n_per_row, const float *quant_weights) {
     353            8 :   return __ggml_quantize_q4_K(src, dst, nrow, n_per_row, quant_weights);
     354              : }
     355              : 
     356            9 : size_t quantize_q6_K(const float *src, void *dst, int64_t nrow,
     357              :                      int64_t n_per_row, const float *quant_weights) {
     358            9 :   return __ggml_quantize_q6_K(src, dst, nrow, n_per_row, quant_weights);
     359              : }
     360              : 
     361           63 : void quantize_row_q6_K(const float *src, void *dst, int64_t k) {
     362           63 :   __ggml_quantize_row_q6_K(src, dst, k);
     363           63 : }
     364              : 
     365           63 : template <> void quantize_row_q8_K(const float *src, void *dst, int64_t k) {
     366           63 :   __ggml_quantize_row_q8_K(src, dst, k);
     367           63 : }
     368              : 
     369            1 : void dequantize_row_q4_K(const void *x_raw, float *y, int64_t k) {
     370            1 :   __ggml_dequantize_row_q4_K(x_raw, y, k);
     371            1 : }
     372              : 
     373           49 : void dequantize_row_q4_0(const void *x_raw, float *y, int64_t k) {
     374           49 :   __ggml_dequantize_row_q4_0(x_raw, y, k);
     375           49 : }
     376              : 
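For orientation, Q4_0 packs 32 weights per block with a single fp16 scale, nibbles split low-half/high-half and offset by 8; a dequantization sketch following the upstream GGML convention (the struct name and the externally supplied float scale are simplifications here):

    #include <cstdint>

    struct block_q4_0_sketch {
      uint16_t d;     // per-block scale, stored as fp16 bits
      uint8_t qs[16]; // 32 x 4-bit quantized weights
    };

    static void dequantize_block_q4_0(const block_q4_0_sketch *b, float scale,
                                      float *y) {
      // scale is d converted from fp16; the conversion helper is omitted.
      for (int j = 0; j < 16; ++j) {
        y[j]      = ((b->qs[j] & 0x0F) - 8) * scale; // low nibbles: first half
        y[j + 16] = ((b->qs[j] >> 4) - 8) * scale;   // high nibbles: second half
      }
    }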
     377            2 : void dequantize_row_q6_K(const void *x, float *y, int64_t k) {
     378            2 :   __ggml_dequantize_row_q6_K(x, y, k);
     379            2 : }
     380              : 
     381            0 : template <> void dequantize_row_q8_K(const void *x, float *y, int64_t k) {
     382            0 :   __ggml_dequantize_row_q8_K(x, y, k);
     383            0 : }
     384              : 
     385           30 : void repack_q4_0(void *W, void *repacked_W, size_t data_size,
     386              :                  const unsigned int M, const unsigned int N) {
     387           30 :   __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
     388           30 : }
     389              : 
     390            0 : void repack_q4_0_to_q4_0_8(void *W, void *repacked_W, size_t data_size,
     391              :                            const unsigned int M, const unsigned int N) {
     392            0 :   __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
     393            0 : }
     394              : 
     395            7 : void repack_q4_K(void *W, void *repacked_W, size_t data_size,
     396              :                  const unsigned int M, const unsigned int N) {
     397            7 :   __ggml_repack_q4_K_to_q4_K_8(W, repacked_W, data_size, M, N);
     398            7 : }
     399              : 
     400           48 : void unpack_q4_0(const void *in_q4_0x, void *out_q4_0, size_t data_size,
     401              :                  const unsigned int M, const unsigned int N) {
     402           48 :   Q4_0Utils::unpackBlocksQ4_0x8((const block_q4_0x8 *)in_q4_0x, data_size, M, N,
     403              :                                 (block_q4_0 *)out_q4_0);
     404           48 : }
     405              : 
     406              : template <>
     407            1 : void softmax_row_inplace(float *qk_out, size_t start_row, size_t end_row,
     408              :                          size_t num_heads, float *sink) {
     409            1 :   nntrainer::avx2::softmax_row_inplace<float>(qk_out, start_row, end_row,
     410              :                                               num_heads, sink);
     411            1 : }
     412              : 
     413              : template <>
     414            1 : void softmax_row(float *qk_out, size_t start_row, size_t end_row,
     415              :                  size_t num_heads, float *sink) {
     416            1 :   nntrainer::avx2::softmax_row<float>(qk_out, start_row, end_row, num_heads,
     417              :                                       sink);
     418            1 : }
     419              : 
     420            1 : void compute_fp16vcache_fp32_transposed(int row_num, const float *in,
     421              :                                         const uint16_t *vcache, float *output,
     422              :                                         int num_cache_head, int gqa_size,
     423              :                                         int head_dim,
     424              :                                         size_t local_window_size) {
     425            1 :   nntrainer::avx2::compute_fp16vcache_fp32_transposed(
     426              :     row_num, in, vcache, output, num_cache_head, gqa_size, head_dim,
     427              :     local_window_size);
     428            1 : }
     429              : 
     430              : template <>
     431            1 : void compute_kcaches(const float *in, const uint16_t *kcache, float *output,
     432              :                      int num_rows, int num_cache_head, int head_dim,
     433              :                      int gqa_size, int tile_size, size_t local_window_size) {
     434            1 :   nntrainer::avx2::compute_kcaches<uint16_t>(in, kcache, output, num_rows,
     435              :                                              num_cache_head, head_dim, gqa_size,
     436              :                                              tile_size, local_window_size);
     437            1 : }
     438              : 
     439            2 : void compute_rotary_emb_value(unsigned int width, unsigned int dim,
     440              :                               unsigned int half_, float *inout, void *output,
     441              :                               const float *cos_, const float *sin_,
     442              :                               bool only_convert_to_fp16) {
     443            2 :   nntrainer::avx2::compute_rotary_emb_value(width, dim, half_, inout, output,
     444              :                                             cos_, sin_, only_convert_to_fp16);
     445            2 : }
     446              : 
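compute_rotary_emb_value applies rotary position embedding (RoPE); assuming channel i is paired with channel i + half_, the per-pair rotation reduces to the standard 2-D rotation sketched below:

    // Rotate one (i, i + half) channel pair by the precomputed angle tables.
    static void rope_pair(float *x, unsigned int i, unsigned int half,
                          const float *cos_, const float *sin_) {
      float a = x[i], b = x[i + half];
      x[i]        = a * cos_[i] - b * sin_[i];
      x[i + half] = a * sin_[i] + b * cos_[i];
    }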
     447            0 : void rms_norm_wrt_width_fp32_intrinsic(const float *__restrict X,
     448              :                                        float *__restrict Y, size_t H, size_t W,
     449              :                                        float epsilon) {
     450            0 :   nntrainer::avx2::rms_norm_wrt_width_fp32_intrinsic(X, Y, H, W, epsilon);
     451            0 : }
     452              : 
     453              : template <>
     454            0 : void rms_norm_wrt_width_fp16_intrinsic(const float *__restrict X,
     455              :                                        float *__restrict Y, size_t H, size_t W,
     456              :                                        float epsilon) {
     457            0 :   __fallback_rms_norm_wrt_width_fp16_intrinsic(X, Y, H, W, epsilon);
     458            0 : }
     459              : 
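Both rms_norm_wrt_width variants normalize each row of W elements by its root mean square; a scalar reference sketch of that definition (not the intrinsic body):

    #include <cmath>
    #include <cstddef>

    static void rms_norm_reference(const float *X, float *Y, size_t H, size_t W,
                                   float epsilon) {
      for (size_t h = 0; h < H; ++h) {
        const float *row = X + h * W;
        float sum_sq = 0.0f;
        for (size_t w = 0; w < W; ++w)
          sum_sq += row[w] * row[w];
        // y = x / sqrt(mean(x^2) + epsilon)
        const float inv_rms =
          1.0f / std::sqrt(sum_sq / static_cast<float>(W) + epsilon);
        for (size_t w = 0; w < W; ++w)
          Y[h * W + w] = row[w] * inv_rms;
      }
    }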
     460              : template <>
     461           21 : void clamp(const float *input, float *output, size_t length, float lower_bound,
     462              :            float upper_bound) {
     463           21 :   nntrainer::avx2::clamp(input, output, length, lower_bound, upper_bound);
     464           21 : }
     465              : 
     466            0 : void create_q4_0_weights(const uint8_t *int4_weight, uint8_t *q4_0_weight) {
     467            0 :   nntrainer::avx2::create_q4_0_weights(int4_weight, q4_0_weight);
     468            0 : }
     469              : 
     470         2400 : void transform_int4_osv32_isv2_to_q4_0(size_t N, size_t K,
     471              :                                        const uint8_t *osv32_weights,
     472              :                                        const uint16_t *osv32_scales,
     473              :                                        size_t scale_group_size,
     474              :                                        void *dst_q4_0x) {
     475              : #ifdef __AVX2__
     476         2400 :   nntrainer::avx2::transform_int4_osv32_isv2_to_q4_0x8(
     477              :     N, K, osv32_weights, osv32_scales, scale_group_size, dst_q4_0x);
     478              : #else
     479              :   __fallback_transform_int4_osv32_isv2_to_q4_0(
     480              :     N, K, osv32_weights, osv32_scales, scale_group_size, 8, dst_q4_0x);
     481              : #endif
     482         2400 : }
     483              : } /* namespace nntrainer */
        

Generated by: LCOV version 2.0-1