// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
 *
 * @file x86_compute_backend.cpp
 * @date 23 April 2024
 * @see https://github.com/nntrainer/nntrainer
 * @author Sungsik Kong <ss.kong@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief Compute backend for x86
 *
 */

#include <assert.h>

#include <avx2_impl.h>
#ifdef USE_BLAS
#include <cblas_interface.h>
#endif
#include <fallback_internal.h>
#include <ggml_interface.h>
#include <nntrainer_error.h>
#include <q4_0_utils.h>
#include <x86_compute_backend.h>

#define ROW_MAJOR 0
#define COL_MAJOR 1

namespace nntrainer {

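/**
 * @brief Initialize the x86 compute backend: set up the GGML interface and
 *        configure the OpenBLAS thread count once.
 * @note  Call once at startup; the thread count is a process-global setting.
 */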
void init_backend() {
  __ggml_init();
  // Do not repeatedly call set_num_threads. It's a global config.
  __openblas_set_num_threads(-1); // -1 = BLAS_NUM_THREADS if defined.
}

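/**
 * @brief copy function : Y = X, converting packed int4 data in X to float
 * @param[in] N number of elements
 * @param[in] X uint8_t * for (packed int4) Vector X
 * @param[in] incX incremental stride of X
 * @param[in] Y float * for Vector Y
 * @param[in] incY incremental stride of Y
 */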
void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_int4_to_float32(N, X, incX, Y, incY);
}

void copy_s16(const unsigned int N, const int16_t *X, int16_t *Y) {
  __fallback_copy_s16(N, X, Y);
}

void copy_u16(const unsigned int N, const uint16_t *X, uint16_t *Y) {
  __fallback_copy_u16(N, X, Y);
}

void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
  __fallback_copy_s16_fp32(N, X, Y);
}

void copy_u16_fp32(const unsigned int N, const uint16_t *X, float *Y) {
  nntrainer::avx2::copy_f16_f32(N, X, Y);
}

void copy_fp32_u32(const unsigned int N, const float *X, uint32_t *Y) {
  __fallback_copy_fp32_u32(N, X, Y);
}

void copy_fp32_u16(const unsigned int N, const float *X, uint16_t *Y) {
  nntrainer::avx2::copy_f32_f16(N, X, Y);
}

void copy_fp32_u8(const unsigned int N, const float *X, uint8_t *Y) {
  __fallback_copy_fp32_u8(N, X, Y);
}

void copy_fp32_s16(const unsigned int N, const float *X, int16_t *Y) {
  __fallback_copy_fp32_s16(N, X, Y);
}

void copy_fp32_s8(const unsigned int N, const float *X, int8_t *Y) {
  __fallback_copy_fp32_s8(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint32_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint32_t *Y) {
  copy_fp32_u32(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint16_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint16_t *Y) {
  copy_fp32_u16(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint8_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint8_t *Y) {
  copy_fp32_u8(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y int16_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, int16_t *Y) {
  copy_fp32_s16(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y int8_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, int8_t *Y) {
  copy_fp32_s8(N, X, Y);
}

void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_uint8_to_float32(N, X, incX, Y, incY);
}

void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_int8_to_float32(N, X, incX, Y, incY);
}

template <>
void sine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
  __fallback_sine(N, X, Y, alpha, beta);
}

template <>
void cosine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
  __fallback_cosine(N, X, Y, alpha, beta);
}

void inv_sqrt_inplace(const unsigned int N, float *X) {
  __fallback_inv_sqrt_inplace(N, X);
}

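/**
 * @brief element-wise vector multiplication : Z = X ⊙ Y (alpha/beta scaling
 *        and the i_stride/o_stride element strides are forwarded to the AVX2
 *        or fallback implementation; ele_add/ele_sub/ele_div below follow the
 *        same convention)
 * @param[in] N number of elements
 * @param[in] X float * for Vector X
 * @param[in] Y float * for Vector Y
 * @param[in] Z float * for Vector Z
 */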
void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  nntrainer::avx2::ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  nntrainer::avx2::ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

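/**
 * @brief saxpy computation : Y = alpha * X + Y
 * @param[in] N number of elements
 * @param[in] alpha float scale factor
 * @param[in] X float * for Vector X
 * @param[in] incX incremental stride of X
 * @param[in] Y float * for Vector Y
 * @param[in] incY incremental stride of Y
 */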
void saxpy(const unsigned int N, const float alpha, const float *X,
           const unsigned int incX, float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  __cblas_saxpy(N, alpha, X, incX, Y, incY);
#else
  __fallback_saxpy(N, alpha, X, incX, Y, incY);
#endif
}

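/**
 * @brief sgemv computation : Y = alpha * op(A) * X + beta * Y, where op(A) is
 *        A or A^T depending on TransA
 * @param[in] TStorageOrder ROW_MAJOR or COL_MAJOR storage order of A
 * @param[in] M number of rows of A
 * @param[in] N number of columns of A
 * @param[in] lda leading dimension of A
 * @param[in] incX / incY incremental strides of X and Y
 */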
void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
           const unsigned int N, const float alpha, const float *A,
           const unsigned int lda, const float *X, const unsigned int incX,
           const float beta, float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                incY);
#else
  __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                   incY);
#endif
}

float sdot(const unsigned int N, const float *X, const unsigned int incX,
           const float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  return __cblas_sdot(N, X, incX, Y, incY);
#else
  return __fallback_sdot(N, X, incX, Y, incY);
#endif
}

void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX,
           uint8_t *Y, const unsigned int incY) {
  __fallback_scopy(N, X, incX, Y, incY);
}

void scopy(const unsigned int N, const int8_t *X, const unsigned int incX,
           int8_t *Y, const unsigned int incY) {
  __fallback_scopy(N, X, incX, Y, incY);
}

void scopy(const unsigned int N, const float *X, const unsigned int incX,
           float *Y, const unsigned int incY) {
  /// @note cblas_scopy triggers a SIGSEGV for some reason. Use the custom
  /// implementation instead.
  // __cblas_scopy(N, X, incX, Y, incY);
  nntrainer::avx2::custom_scopy(N, X, incX, Y, incY);
}

void sscal(const unsigned int N, const float alpha, float *X,
           const unsigned int incX) {
#ifdef USE_BLAS
  __cblas_sscal(N, alpha, X, incX);
#else
  __fallback_sscal(N, alpha, X, incX);
#endif
}

float snrm2(const unsigned int N, const float *X, const unsigned int incX) {
#ifdef USE_BLAS
  return __cblas_snrm2(N, X, incX);
#else
  return __fallback_snrm2(N, X, incX);
#endif
}

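/**
 * @brief sgemm computation : C = alpha * op(A) * op(B) + beta * C, where op()
 *        applies the optional transpose selected by TransA / TransB
 * @param[in] TStorageOrder ROW_MAJOR or COL_MAJOR storage order
 * @param[in] M number of rows of op(A) and C
 * @param[in] N number of columns of op(B) and C
 * @param[in] K number of columns of op(A) and rows of op(B)
 * @param[in] lda / ldb / ldc leading dimensions of A, B, and C
 */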
void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const float *A, const unsigned int lda,
           const float *B, const unsigned int ldb, const float beta, float *C,
           const unsigned int ldc) {
#ifdef USE_BLAS
  __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
                beta, C, ldc);
#else
  __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B,
                   ldb, beta, C, ldc);
#endif
}

unsigned int isamax(const unsigned int N, const float *X,
                    const unsigned int incX) {
#ifdef USE_BLAS
  return __cblas_isamax(N, X, incX);
#else
  return __fallback_isamax(N, X, incX);
#endif
}

void transpose_matrix(const unsigned int M, const unsigned int N,
                      const float *src, unsigned int ld_src, float *dst,
                      unsigned int ld_dst) {
  nntrainer::avx2::transpose_matrix(M, N, src, ld_src, dst, ld_dst);
}

bool is_valid(const unsigned int N, const float *input) {
  return nntrainer::avx2::is_valid(N, input);
}

void unpack_q4_0x8_transpose16(const void *src, uint16_t *d_out,
                               uint16_t *qs_out, int N, int K) {
  return nntrainer::avx2::unpack_q4_0x8_transpose16(src, d_out, qs_out, N, K);
}

template <>
void calc_trigonometric_vals_dup(unsigned int N_half, float *angle, float *cos_,
                                 float *sin_, unsigned int from,
                                 float attention_scaling) {
  __fallback_calc_trigonometric_vals_dup(N_half, angle, cos_, sin_, from,
                                         attention_scaling);
}

void swiglu(const unsigned int N, float *X, float *Y, float *Z) {
  nntrainer::avx2::swiglu(N, X, Y, Z);
}

void swiglu(const unsigned int N, float *X, float *Y, float *Z, float alpha) {
  nntrainer::avx2::swiglu(N, X, Y, Z, alpha);
}

float max_val(const unsigned int N, float *X) { return __fallback_max(N, X); }

void softmax(const unsigned int N, float *X, float *Y) {
  __fallback_softmax(N, X, Y);
}

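/**
 * @brief q4_0 GEMM : computes C from the float activations A and the
 *        q4_0-quantized weights B; B is expected in the repacked q4_0x8 block
 *        layout (see repack_q4_0), and the activations are quantized
 *        internally by __ggml_q4_0_8x8_q8_0_GEMM
 * @param[in] M number of rows of A and C
 * @param[in] N number of columns of B and C
 * @param[in] K shared inner dimension
 */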
template <>
void gemm_q4_0(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_q4_0_8x8_q8_0_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
}

void gemm_q4_0(const unsigned int M, std::vector<unsigned int> Ns,
               const unsigned int K, const float *A, const unsigned int lda,
               std::vector<void *> Bs, std::vector<unsigned int> ldbs,
               std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
  throw std::runtime_error("Error: NYI for gemm_q4_0 with vectored weights");
}

void gemm_q4_K(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_q4_K_8x8_q8_K_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
}

void gemm_q4_K(const unsigned int M, std::vector<unsigned int> Ns,
               const unsigned int K, const float *A, const unsigned int lda,
               std::vector<void *> Bs, std::vector<unsigned int> ldbs,
               std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
  return __ggml_q4_K_8x8_q8_K_GEMM(M, Ns, K, A, lda, Bs, ldbs, Cs, ldcs);
}

float dot_q6_K_q8_K(const unsigned int K, const void *v_q6_K,
                    const void *v_q8_K) {
  return __ggml_vec_dot_q6_K_q8_K(K, v_q6_K, v_q8_K);
}

float dot_q6_K_f32(const unsigned int K, const void *v_q6_K, const float *f) {
  return __ggml_vec_dot_q6_K_f32(K, v_q6_K, f);
}

template <>
void gemm_q6_K(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_gemm_q6_K(M, N, K, A, lda, B, ldb, C, ldc);
}

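/**
 * @brief quantize a row-major float matrix to the q4_0 block format (the
 *        q4_K / q6_K variants below behave the same way for their formats)
 * @param[in] src float * for source data
 * @param[out] dst destination buffer for the quantized blocks
 * @param[in] nrow number of rows
 * @param[in] n_per_row number of elements per row
 * @param[in] quant_weights optional importance weights (may be nullptr)
 * @return size of the quantized output in bytes, forwarded from the GGML
 *         quantizer
 */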
size_t quantize_q4_0(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q4_0(src, dst, nrow, n_per_row, quant_weights);
}

size_t quantize_q4_K(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q4_K(src, dst, nrow, n_per_row, quant_weights);
}

size_t quantize_q6_K(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q6_K(src, dst, nrow, n_per_row, quant_weights);
}

void quantize_row_q6_K(const float *src, void *dst, int64_t k) {
  __ggml_quantize_row_q6_K(src, dst, k);
}

template <> void quantize_row_q8_K(const float *src, void *dst, int64_t k) {
  __ggml_quantize_row_q8_K(src, dst, k);
}

void dequantize_row_q4_K(const void *x_raw, float *y, int64_t k) {
  __ggml_dequantize_row_q4_K(x_raw, y, k);
}

void dequantize_row_q4_0(const void *x_raw, float *y, int64_t k) {
  __ggml_dequantize_row_q4_0(x_raw, y, k);
}

void dequantize_row_q6_K(const void *x, float *y, int64_t k) {
  __ggml_dequantize_row_q6_K(x, y, k);
}

template <> void dequantize_row_q8_K(const void *x, float *y, int64_t k) {
  __ggml_dequantize_row_q8_K(x, y, k);
}

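/**
 * @brief repack plain q4_0 blocks into the interleaved q4_0x8 layout expected
 *        by the 8x8 GEMM kernel
 * @param[in] W source q4_0 weight buffer
 * @param[out] repacked_W destination buffer
 * @param[in] data_size size of W in bytes
 * @param[in] M number of rows
 * @param[in] N number of columns
 */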
void repack_q4_0(void *W, void *repacked_W, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
}

void repack_q4_0_to_q4_0_8(void *W, void *repacked_W, size_t data_size,
                           const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
}

void repack_q4_K(void *W, void *repacked_W, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_K_to_q4_K_8(W, repacked_W, data_size, M, N);
}

void unpack_q4_0(const void *in_q4_0x, void *out_q4_0, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  Q4_0Utils::unpackBlocksQ4_0x8((const block_q4_0x8 *)in_q4_0x, data_size, M, N,
                                (block_q4_0 *)out_q4_0);
}

template <>
void softmax_row_inplace(float *qk_out, size_t start_row, size_t end_row,
                         size_t num_heads, float *sink) {
  nntrainer::avx2::softmax_row_inplace<float>(qk_out, start_row, end_row,
                                              num_heads, sink);
}

template <>
void softmax_row(float *qk_out, size_t start_row, size_t end_row,
                 size_t num_heads, float *sink) {
  nntrainer::avx2::softmax_row<float>(qk_out, start_row, end_row, num_heads,
                                      sink);
}

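/**
 * @brief attention helper : combine the fp32 attention weights in `in` with
 *        the fp16 value cache `vcache` and write fp32 results to `output`;
 *        num_cache_head, gqa_size, and head_dim describe the grouped-query
 *        attention layout, and local_window_size limits the attended range
 */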
void compute_fp16vcache_fp32_transposed(int row_num, const float *in,
                                        const uint16_t *vcache, float *output,
                                        int num_cache_head, int gqa_size,
                                        int head_dim,
                                        size_t local_window_size) {
  nntrainer::avx2::compute_fp16vcache_fp32_transposed(
    row_num, in, vcache, output, num_cache_head, gqa_size, head_dim,
    local_window_size);
}

template <>
void compute_kcaches(const float *in, const uint16_t *kcache, float *output,
                     int num_rows, int num_cache_head, int head_dim,
                     int gqa_size, int tile_size, size_t local_window_size) {
  nntrainer::avx2::compute_kcaches<uint16_t>(in, kcache, output, num_rows,
                                             num_cache_head, head_dim, gqa_size,
                                             tile_size, local_window_size);
}

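/**
 * @brief apply rotary position embedding (RoPE) values to `inout` using the
 *        precomputed cos_ / sin_ tables; as the flag name suggests,
 *        only_convert_to_fp16 requests a plain fp16 conversion into `output`
 *        instead of the rotation
 */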
void compute_rotary_emb_value(unsigned int width, unsigned int dim,
                              unsigned int half_, float *inout, void *output,
                              const float *cos_, const float *sin_,
                              bool only_convert_to_fp16) {
  nntrainer::avx2::compute_rotary_emb_value(width, dim, half_, inout, output,
                                            cos_, sin_, only_convert_to_fp16);
}

void rms_norm_wrt_width_fp32_intrinsic(const float *__restrict X,
                                       float *__restrict Y, size_t H, size_t W,
                                       float epsilon) {
  nntrainer::avx2::rms_norm_wrt_width_fp32_intrinsic(X, Y, H, W, epsilon);
}

template <>
void rms_norm_wrt_width_fp16_intrinsic(const float *__restrict X,
                                       float *__restrict Y, size_t H, size_t W,
                                       float epsilon) {
  __fallback_rms_norm_wrt_width_fp16_intrinsic(X, Y, H, W, epsilon);
}

template <>
void clamp(const float *input, float *output, size_t length, float lower_bound,
           float upper_bound) {
  nntrainer::avx2::clamp(input, output, length, lower_bound, upper_bound);
}

void create_q4_0_weights(const uint8_t *int4_weight, uint8_t *q4_0_weight) {
  nntrainer::avx2::create_q4_0_weights(int4_weight, q4_0_weight);
}

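/**
 * @brief convert int4 weights stored in an OSV32/ISV2 blocked layout, together
 *        with their 16-bit per-group scales, into the q4_0x8 block format
 *        consumed by gemm_q4_0; scale_group_size is the number of weights
 *        sharing one scale
 */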
void transform_int4_osv32_isv2_to_q4_0(size_t N, size_t K,
                                       const uint8_t *osv32_weights,
                                       const uint16_t *osv32_scales,
                                       size_t scale_group_size,
                                       void *dst_q4_0x) {
#ifdef __AVX2__
  nntrainer::avx2::transform_int4_osv32_isv2_to_q4_0x8(
    N, K, osv32_weights, osv32_scales, scale_group_size, dst_q4_0x);
#else
  __fallback_transform_int4_osv32_isv2_to_q4_0(
    N, K, osv32_weights, osv32_scales, scale_group_size, 8, dst_q4_0x);
#endif
}
} /* namespace nntrainer */