// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
 *
 * @file x86_compute_backend.cpp
 * @date 23 April 2024
 * @see https://github.com/nnstreamer/nntrainer
 * @author Sungsik Kong <ss.kong@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief Compute backend for x86
 *
 */

#include <assert.h>

#include <avx2_impl.h>
#ifdef USE_BLAS
#include <cblas_interface.h>
#endif
#include <fallback_internal.h>
#include <ggml_interface.h>
#include <nntrainer_error.h>
#include <q4_0_utils.h>
#include <x86_compute_backend.h>

#define ROW_MAJOR 0
#define COL_MAJOR 1

namespace nntrainer {

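/// @brief Initialize the x86 compute backend; delegates to the GGML interface.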
void init_backend() { __ggml_init(); }

void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_int4_to_float32(N, X, incX, Y, incY);
}

void copy_s16(const unsigned int N, const int16_t *X, int16_t *Y) {
  __fallback_copy_s16(N, X, Y);
}

void copy_u16(const unsigned int N, const uint16_t *X, uint16_t *Y) {
  __fallback_copy_u16(N, X, Y);
}

void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
  __fallback_copy_s16_fp32(N, X, Y);
}

void copy_u16_fp32(const unsigned int N, const uint16_t *X, float *Y) {
  nntrainer::avx2::copy_f16_f32(N, X, Y);
}

void copy_fp32_u32(const unsigned int N, const float *X, uint32_t *Y) {
  __fallback_copy_fp32_u32(N, X, Y);
}

void copy_fp32_u16(const unsigned int N, const float *X, uint16_t *Y) {
  nntrainer::avx2::copy_f32_f16(N, X, Y);
}

void copy_fp32_u8(const unsigned int N, const float *X, uint8_t *Y) {
  __fallback_copy_fp32_u8(N, X, Y);
}

void copy_fp32_s16(const unsigned int N, const float *X, int16_t *Y) {
  __fallback_copy_fp32_s16(N, X, Y);
}

void copy_fp32_s8(const unsigned int N, const float *X, int8_t *Y) {
  __fallback_copy_fp32_s8(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint32_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint32_t *Y) {
  copy_fp32_u32(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint16_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint16_t *Y) {
  copy_fp32_u16(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint8_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint8_t *Y) {
  copy_fp32_u8(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y int16_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, int16_t *Y) {
  copy_fp32_s16(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y int8_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, int8_t *Y) {
  copy_fp32_s8(N, X, Y);
}

void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_uint8_to_float32(N, X, incX, Y, incY);
}

void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_int8_to_float32(N, X, incX, Y, incY);
}

template <>
void sine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
  __fallback_sine(N, X, Y, alpha, beta);
}

template <>
void cosine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
  __fallback_cosine(N, X, Y, alpha, beta);
}

void inv_sqrt_inplace(const unsigned int N, float *X) {
  __fallback_inv_sqrt_inplace(N, X);
}

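// Element-wise vector operations: multiplication and addition use the AVX2
// kernels, while subtraction and division currently use the fallback path.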
void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  nntrainer::avx2::ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  nntrainer::avx2::ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

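// BLAS level-1/2/3 wrappers: when built with USE_BLAS these forward to the
// cblas interface, otherwise the generic fallback implementations are used.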
void saxpy(const unsigned int N, const float alpha, const float *X,
           const unsigned int incX, float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  __cblas_saxpy(N, alpha, X, incX, Y, incY);
#else
  __fallback_saxpy(N, alpha, X, incX, Y, incY);
#endif
}

void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
           const unsigned int N, const float alpha, const float *A,
           const unsigned int lda, const float *X, const unsigned int incX,
           const float beta, float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                incY);
#else
  __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                   incY);
#endif
}

float sdot(const unsigned int N, const float *X, const unsigned int incX,
           const float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  return __cblas_sdot(N, X, incX, Y, incY);
#else
  return __fallback_sdot(N, X, incX, Y, incY);
#endif
}

void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX,
           uint8_t *Y, const unsigned int incY) {
  __fallback_scopy(N, X, incX, Y, incY);
}

void scopy(const unsigned int N, const int8_t *X, const unsigned int incX,
           int8_t *Y, const unsigned int incY) {
  __fallback_scopy(N, X, incX, Y, incY);
}

void scopy(const unsigned int N, const float *X, const unsigned int incX,
           float *Y, const unsigned int incY) {
  /// @note cblas_scopy is evoking SIGSEGV for some reason. Use custom
  /// implementation instead.
  // __cblas_scopy(N, X, incX, Y, incY);
  nntrainer::avx2::custom_scopy(N, X, incX, Y, incY);
}

void sscal(const unsigned int N, const float alpha, float *X,
           const unsigned int incX) {
#ifdef USE_BLAS
  __cblas_sscal(N, alpha, X, incX);
#else
  __fallback_sscal(N, alpha, X, incX);
#endif
}

float snrm2(const unsigned int N, const float *X, const unsigned int incX) {
#ifdef USE_BLAS
  return __cblas_snrm2(N, X, incX);
#else
  return __fallback_snrm2(N, X, incX);
#endif
}

void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const float *A, const unsigned int lda,
           const float *B, const unsigned int ldb, const float beta, float *C,
           const unsigned int ldc) {
#ifdef USE_BLAS
  __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
                beta, C, ldc);
#else
  __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B,
                   ldb, beta, C, ldc);
#endif
}

unsigned int isamax(const unsigned int N, const float *X,
                    const unsigned int incX) {
#ifdef USE_BLAS
  return __cblas_isamax(N, X, incX);
#else
  return __fallback_isamax(N, X, incX);
#endif
}

void transpose_matrix(const unsigned int M, const unsigned int N,
                      const float *src, unsigned int ld_src, float *dst,
                      unsigned int ld_dst) {
  nntrainer::avx2::transpose_matrix(M, N, src, ld_src, dst, ld_dst);
}

bool is_valid(const unsigned int N, const float *input) {
  return nntrainer::avx2::is_valid(N, input);
}

void unpack_q4_0x8_transpose16(const void *src, uint16_t *d_out,
                               uint16_t *qs_out, int N, int K) {
  return nntrainer::avx2::unpack_q4_0x8_transpose16(src, d_out, qs_out, N, K);
}

template <>
void calc_trigonometric_vals_dup(unsigned int N_half, float *angle, float *cos_,
                                 float *sin_, unsigned int from,
                                 float attention_scaling) {
  __fallback_calc_trigonometric_vals_dup(N_half, angle, cos_, sin_, from,
                                         attention_scaling);
}

void swiglu(const unsigned int N, float *X, float *Y, float *Z) {
  nntrainer::avx2::swiglu(N, X, Y, Z);
}

void swiglu(const unsigned int N, float *X, float *Y, float *Z, float alpha) {
  nntrainer::avx2::swiglu(N, X, Y, Z, alpha);
}

float max_val(const unsigned int N, float *X) { return __fallback_max(N, X); }

void softmax(const unsigned int N, float *X, float *Y) {
  __fallback_softmax(N, X, Y);
}

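// Quantized GEMM and dot-product kernels, forwarded to the GGML-based
// implementations (q4_0 / q4_K / q6_K weights against float activations).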
template <>
void gemm_q4_0(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_q4_0_8x8_q8_0_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
}

void gemm_q4_0(const unsigned int M, std::vector<unsigned int> Ns,
               const unsigned int K, const float *A, const unsigned int lda,
               std::vector<void *> Bs, std::vector<unsigned int> ldbs,
               std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
  throw std::runtime_error("Error: NYI for gemm_q4_0 with vectored weights");
}

void gemm_q4_K(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_q4_K_8x8_q8_K_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
}

void gemm_q4_K(const unsigned int M, std::vector<unsigned int> Ns,
               const unsigned int K, const float *A, const unsigned int lda,
               std::vector<void *> Bs, std::vector<unsigned int> ldbs,
               std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
  return __ggml_q4_K_8x8_q8_K_GEMM(M, Ns, K, A, lda, Bs, ldbs, Cs, ldcs);
}

float dot_q6_K_q8_K(const unsigned int K, const void *v_q6_K,
                    const void *v_q8_K) {
  return __ggml_vec_dot_q6_K_q8_K(K, v_q6_K, v_q8_K);
}

float dot_q6_K_f32(const unsigned int K, const void *v_q6_K, const float *f) {
  return __ggml_vec_dot_q6_K_f32(K, v_q6_K, f);
}

template <>
void gemm_q6_K(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_gemm_q6_K(M, N, K, A, lda, B, ldb, C, ldc);
}

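// Quantization, dequantization and weight-repacking helpers, delegated to the
// GGML interface (block-wise q4_0 packing is handled by Q4_0Utils).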
size_t quantize_q4_0(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q4_0(src, dst, nrow, n_per_row, quant_weights);
}

size_t quantize_q4_K(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q4_K(src, dst, nrow, n_per_row, quant_weights);
}

size_t quantize_q6_K(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q6_K(src, dst, nrow, n_per_row, quant_weights);
}

void quantize_row_q6_K(const float *src, void *dst, int64_t k) {
  __ggml_quantize_row_q6_K(src, dst, k);
}

template <> void quantize_row_q8_K(const float *src, void *dst, int64_t k) {
  __ggml_quantize_row_q8_K(src, dst, k);
}

void dequantize_row_q4_K(const void *x_raw, float *y, int64_t k) {
  __ggml_dequantize_row_q4_K(x_raw, y, k);
}

void dequantize_row_q4_0(const void *x_raw, float *y, int64_t k) {
  __ggml_dequantize_row_q4_0(x_raw, y, k);
}

void dequantize_row_q6_K(const void *x, float *y, int64_t k) {
  __ggml_dequantize_row_q6_K(x, y, k);
}

template <> void dequantize_row_q8_K(const void *x, float *y, int64_t k) {
  __ggml_dequantize_row_q8_K(x, y, k);
}

void repack_q4_0(void *W, void *repacked_W, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
}

void repack_q4_0_to_q4_0_8(void *W, void *repacked_W, size_t data_size,
                           const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
}

void repack_q4_K(void *W, void *repacked_W, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_K_to_q4_K_8(W, repacked_W, data_size, M, N);
}

void unpack_q4_0(const void *in_q4_0x, void *out_q4_0, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  Q4_0Utils::unpackBlocksQ4_0x8((const block_q4_0x8 *)in_q4_0x, data_size, M, N,
                                (block_q4_0 *)out_q4_0);
}

template <>
void softmax_row_inplace(float *qk_out, size_t start_row, size_t end_row,
                         size_t num_heads, float *sink) {
  nntrainer::avx2::softmax_row_inplace<float>(qk_out, start_row, end_row,
                                              num_heads, sink);
}

template <>
void softmax_row(float *qk_out, size_t start_row, size_t end_row,
                 size_t num_heads, float *sink) {
  nntrainer::avx2::softmax_row<float>(qk_out, start_row, end_row, num_heads,
                                      sink);
}

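// Attention / KV-cache helpers: fp16 value-cache and key-cache products and
// rotary-embedding computation, implemented with AVX2 intrinsics.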
void compute_fp16vcache_fp32_transposed(int row_num, const float *in,
                                        const uint16_t *vcache, float *output,
                                        int num_cache_head, int gqa_size,
                                        int head_dim,
                                        size_t local_window_size) {
  nntrainer::avx2::compute_fp16vcache_fp32_transposed(
    row_num, in, vcache, output, num_cache_head, gqa_size, head_dim,
    local_window_size);
}

template <>
void compute_kcaches(const float *in, const uint16_t *kcache, float *output,
                     int num_rows, int num_cache_head, int head_dim,
                     int gqa_size, int tile_size, size_t local_window_size) {
  nntrainer::avx2::compute_kcaches<uint16_t>(in, kcache, output, num_rows,
                                             num_cache_head, head_dim, gqa_size,
                                             tile_size, local_window_size);
}

void compute_rotary_emb_value(unsigned int width, unsigned int dim,
                              unsigned int half_, float *inout, void *output,
                              const float *cos_, const float *sin_,
                              bool only_convert_to_fp16) {
  nntrainer::avx2::compute_rotary_emb_value(width, dim, half_, inout, output,
                                            cos_, sin_, only_convert_to_fp16);
}

void rms_norm_wrt_width_fp32_intrinsic(const float *__restrict X,
                                       float *__restrict Y, size_t H, size_t W,
                                       float epsilon) {
  nntrainer::avx2::rms_norm_wrt_width_fp32_intrinsic(X, Y, H, W, epsilon);
}

template <>
void rms_norm_wrt_width_fp16_intrinsic(const float *__restrict X,
                                       float *__restrict Y, size_t H, size_t W,
                                       float epsilon) {
  __fallback_rms_norm_wrt_width_fp16_intrinsic(X, Y, H, W, epsilon);
}

template <>
void clamp(const float *input, float *output, size_t length, float lower_bound,
           float upper_bound) {
  nntrainer::avx2::clamp(input, output, length, lower_bound, upper_bound);
}

void create_q4_0_weights(const uint8_t *int4_weight, uint8_t *q4_0_weight) {
  nntrainer::avx2::create_q4_0_weights(int4_weight, q4_0_weight);
}

void transform_q4_0x_from_int4(size_t N, size_t K, const uint8_t *osv32_weights,
                               const uint16_t *osv32_scales,
                               size_t scale_group_size, void *dst_q4_0x) {
  Q4_0Utils::transformQ4_0x_FromInt4(N, K, osv32_weights, osv32_scales,
                                     scale_group_size, 8, dst_q4_0x);
}
} /* namespace nntrainer */