// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
 *
 * @file x86_compute_backend.cpp
 * @date 23 April 2024
 * @see https://github.com/nntrainer/nntrainer
 * @author Sungsik Kong <ss.kong@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief Compute backend for x86
 *
 */

#include <assert.h>

#include <avx2_impl.h>
#ifdef USE_BLAS
#include <cblas_interface.h>
#endif
#include <fallback_internal.h>
#include <ggml_interface.h>
#include <nntrainer_error.h>
#include <q4_0_utils.h>
#include <x86_compute_backend.h>

#define ROW_MAJOR 0
#define COL_MAJOR 1

namespace nntrainer {

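/**
 * @brief Initialize the x86 compute backend: set up the GGML interface and
 *        configure the OpenBLAS thread count once.
 * @note  Call once at startup; the thread count is a process-global setting.
 */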
void init_backend() {
  __ggml_init();
  // Do not repeatedly call set_num_threads. It's a global config.
  __openblas_set_num_threads(-1); // -1 = BLAS_NUM_THREADS if defined.
}

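/**
 * @brief copy function : Y = X, converting packed int4 data in X to float
 * @param[in] N number of elements
 * @param[in] X uint8_t * for (packed int4) Vector X
 * @param[in] incX incremental stride of X
 * @param[in] Y float * for Vector Y
 * @param[in] incY incremental stride of Y
 */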
void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_int4_to_float32(N, X, incX, Y, incY);
}

void copy_s16(const unsigned int N, const int16_t *X, int16_t *Y) {
  __fallback_copy_s16(N, X, Y);
}

void copy_u16(const unsigned int N, const uint16_t *X, uint16_t *Y) {
  __fallback_copy_u16(N, X, Y);
}

void copy_s16_fp32(const unsigned int N, const int16_t *X, float *Y) {
  __fallback_copy_s16_fp32(N, X, Y);
}

void copy_u16_fp32(const unsigned int N, const uint16_t *X, float *Y) {
  nntrainer::avx2::copy_f16_f32(N, X, Y);
}

void copy_fp32_u32(const unsigned int N, const float *X, uint32_t *Y) {
  __fallback_copy_fp32_u32(N, X, Y);
}

void copy_fp32_u16(const unsigned int N, const float *X, uint16_t *Y) {
  nntrainer::avx2::copy_f32_f16(N, X, Y);
}

void copy_fp32_u8(const unsigned int N, const float *X, uint8_t *Y) {
  __fallback_copy_fp32_u8(N, X, Y);
}

void copy_fp32_s16(const unsigned int N, const float *X, int16_t *Y) {
  __fallback_copy_fp32_s16(N, X, Y);
}

void copy_fp32_s8(const unsigned int N, const float *X, int8_t *Y) {
  __fallback_copy_fp32_s8(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint32_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint32_t *Y) {
  copy_fp32_u32(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint16_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint16_t *Y) {
  copy_fp32_u16(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y uint8_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, uint8_t *Y) {
  copy_fp32_u8(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y int16_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, int16_t *Y) {
  copy_fp32_s16(N, X, Y);
}

/**
 * @brief copy function : Y = X
 * @param[in] N number of elements in X
 * @param[in] X float * for Vector X
 * @param[in] Y int8_t * for Vector Y
 */
template <> void copy_fp32(const unsigned int N, const float *X, int8_t *Y) {
  copy_fp32_s8(N, X, Y);
}

void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_uint8_to_float32(N, X, incX, Y, incY);
}

void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
                           const unsigned int incX, float *Y,
                           const unsigned int incY) {
  __fallback_scopy_int8_to_float32(N, X, incX, Y, incY);
}

template <>
void sine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
  __fallback_sine(N, X, Y, alpha, beta);
}

template <>
void cosine(const unsigned int N, float *X, float *Y, float alpha, float beta) {
  __fallback_cosine(N, X, Y, alpha, beta);
}

void inv_sqrt_inplace(const unsigned int N, float *X) {
  __fallback_inv_sqrt_inplace(N, X);
}

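/**
 * @brief element-wise vector multiplication : Z = X ⊙ Y (alpha/beta scaling
 *        and the i_stride/o_stride element strides are forwarded to the AVX2
 *        or fallback implementation; ele_add/ele_sub/ele_div below follow the
 *        same convention)
 * @param[in] N number of elements
 * @param[in] X float * for Vector X
 * @param[in] Y float * for Vector Y
 * @param[in] Z float * for Vector Z
 */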
void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  nntrainer::avx2::ele_mul(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  nntrainer::avx2::ele_add(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  __fallback_ele_sub(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
             float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
  __fallback_ele_div(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

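/**
 * @brief saxpy computation : Y = alpha * X + Y
 * @param[in] N number of elements
 * @param[in] alpha float scale factor
 * @param[in] X float * for Vector X
 * @param[in] incX incremental stride of X
 * @param[in] Y float * for Vector Y
 * @param[in] incY incremental stride of Y
 */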
void saxpy(const unsigned int N, const float alpha, const float *X,
           const unsigned int incX, float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  __cblas_saxpy(N, alpha, X, incX, Y, incY);
#else
  __fallback_saxpy(N, alpha, X, incX, Y, incY);
#endif
}

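/**
 * @brief sgemv computation : Y = alpha * op(A) * X + beta * Y, where op(A) is
 *        A or A^T depending on TransA
 * @param[in] TStorageOrder ROW_MAJOR or COL_MAJOR storage order of A
 * @param[in] M number of rows of A
 * @param[in] N number of columns of A
 * @param[in] lda leading dimension of A
 * @param[in] incX / incY incremental strides of X and Y
 */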
void sgemv(const unsigned int TStorageOrder, bool TransA, const unsigned int M,
           const unsigned int N, const float alpha, const float *A,
           const unsigned int lda, const float *X, const unsigned int incX,
           const float beta, float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  __cblas_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                incY);
#else
  __fallback_sgemv(TStorageOrder, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                   incY);
#endif
}

float sdot(const unsigned int N, const float *X, const unsigned int incX,
           const float *Y, const unsigned int incY) {
#ifdef USE_BLAS
  return __cblas_sdot(N, X, incX, Y, incY);
#else
  return __fallback_sdot(N, X, incX, Y, incY);
#endif
}

void scopy(const unsigned int N, const uint8_t *X, const unsigned int incX,
           uint8_t *Y, const unsigned int incY) {
  __fallback_scopy(N, X, incX, Y, incY);
}

void scopy(const unsigned int N, const int8_t *X, const unsigned int incX,
           int8_t *Y, const unsigned int incY) {
  __fallback_scopy(N, X, incX, Y, incY);
}

void scopy(const unsigned int N, const float *X, const unsigned int incX,
           float *Y, const unsigned int incY) {
  /// @note cblas_scopy triggers a SIGSEGV for some reason. Use the custom
  /// implementation instead.
  // __cblas_scopy(N, X, incX, Y, incY);
  nntrainer::avx2::custom_scopy(N, X, incX, Y, incY);
}

void sscal(const unsigned int N, const float alpha, float *X,
           const unsigned int incX) {
#ifdef USE_BLAS
  __cblas_sscal(N, alpha, X, incX);
#else
  __fallback_sscal(N, alpha, X, incX);
#endif
}

float snrm2(const unsigned int N, const float *X, const unsigned int incX) {
#ifdef USE_BLAS
  return __cblas_snrm2(N, X, incX);
#else
  return __fallback_snrm2(N, X, incX);
#endif
}

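/**
 * @brief sgemm computation : C = alpha * op(A) * op(B) + beta * C, where op()
 *        applies the optional transpose selected by TransA / TransB
 * @param[in] TStorageOrder ROW_MAJOR or COL_MAJOR storage order
 * @param[in] M number of rows of op(A) and C
 * @param[in] N number of columns of op(B) and C
 * @param[in] K number of columns of op(A) and rows of op(B)
 * @param[in] lda / ldb / ldc leading dimensions of A, B, and C
 */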
void sgemm(const unsigned int TStorageOrder, bool TransA, bool TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const float *A, const unsigned int lda,
           const float *B, const unsigned int ldb, const float beta, float *C,
           const unsigned int ldc) {
#ifdef USE_BLAS
  __cblas_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
                beta, C, ldc);
#else
  __fallback_sgemm(TStorageOrder, TransA, TransB, M, N, K, alpha, A, lda, B,
                   ldb, beta, C, ldc);
#endif
}

unsigned int isamax(const unsigned int N, const float *X,
                    const unsigned int incX) {
#ifdef USE_BLAS
  return __cblas_isamax(N, X, incX);
#else
  return __fallback_isamax(N, X, incX);
#endif
}

void transpose_matrix(const unsigned int M, const unsigned int N,
                      const float *src, unsigned int ld_src, float *dst,
                      unsigned int ld_dst) {
  nntrainer::avx2::transpose_matrix(M, N, src, ld_src, dst, ld_dst);
}

bool is_valid(const unsigned int N, const float *input) {
  return nntrainer::avx2::is_valid(N, input);
}

void unpack_q4_0x8_transpose16(const void *src, uint16_t *d_out,
                               uint16_t *qs_out, int N, int K) {
  return nntrainer::avx2::unpack_q4_0x8_transpose16(src, d_out, qs_out, N, K);
}

template <>
void calc_trigonometric_vals_dup(unsigned int N_half, float *angle, float *cos_,
                                 float *sin_, unsigned int from,
                                 float attention_scaling) {
  __fallback_calc_trigonometric_vals_dup(N_half, angle, cos_, sin_, from,
                                         attention_scaling);
}

void swiglu(const unsigned int N, float *X, float *Y, float *Z) {
  nntrainer::avx2::swiglu(N, X, Y, Z);
}

void swiglu(const unsigned int N, float *X, float *Y, float *Z, float alpha) {
  nntrainer::avx2::swiglu(N, X, Y, Z, alpha);
}

float max_val(const unsigned int N, float *X) { return __fallback_max(N, X); }

void softmax(const unsigned int N, float *X, float *Y) {
  __fallback_softmax(N, X, Y);
}

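/**
 * @brief q4_0 GEMM : computes C from the float activations A and the
 *        q4_0-quantized weights B; B is expected in the repacked q4_0x8 block
 *        layout (see repack_q4_0), and the activations are quantized
 *        internally by __ggml_q4_0_8x8_q8_0_GEMM
 * @param[in] M number of rows of A and C
 * @param[in] N number of columns of B and C
 * @param[in] K shared inner dimension
 */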
template <>
void gemm_q4_0(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_q4_0_8x8_q8_0_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
}

void gemm_q4_0(const unsigned int M, std::vector<unsigned int> Ns,
               const unsigned int K, const float *A, const unsigned int lda,
               std::vector<void *> Bs, std::vector<unsigned int> ldbs,
               std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
  throw std::runtime_error("Error: NYI for gemm_q4_0 with vectored weights");
}

void gemm_q4_K(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_q4_K_8x8_q8_K_GEMM(M, N, K, A, lda, B, ldb, C, ldc);
}

void gemm_q4_K(const unsigned int M, std::vector<unsigned int> Ns,
               const unsigned int K, const float *A, const unsigned int lda,
               std::vector<void *> Bs, std::vector<unsigned int> ldbs,
               std::vector<float *> Cs, std::vector<unsigned int> ldcs) {
  return __ggml_q4_K_8x8_q8_K_GEMM(M, Ns, K, A, lda, Bs, ldbs, Cs, ldcs);
}

float dot_q6_K_q8_K(const unsigned int K, const void *v_q6_K,
                    const void *v_q8_K) {
  return __ggml_vec_dot_q6_K_q8_K(K, v_q6_K, v_q8_K);
}

float dot_q6_K_f32(const unsigned int K, const void *v_q6_K, const float *f) {
  return __ggml_vec_dot_q6_K_f32(K, v_q6_K, f);
}

template <>
void gemm_q6_K(const unsigned int M, const unsigned int N, const unsigned int K,
               const float *A, const unsigned int lda, const void *B,
               const unsigned int ldb, float *C, const unsigned int ldc) {
  return __ggml_gemm_q6_K(M, N, K, A, lda, B, ldb, C, ldc);
}

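/**
 * @brief quantize a row-major float matrix to the q4_0 block format (the
 *        q4_K / q6_K variants below behave the same way for their formats)
 * @param[in] src float * for source data
 * @param[out] dst destination buffer for the quantized blocks
 * @param[in] nrow number of rows
 * @param[in] n_per_row number of elements per row
 * @param[in] quant_weights optional importance weights (may be nullptr)
 * @return size of the quantized output in bytes, forwarded from the GGML
 *         quantizer
 */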
size_t quantize_q4_0(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q4_0(src, dst, nrow, n_per_row, quant_weights);
}

size_t quantize_q4_K(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q4_K(src, dst, nrow, n_per_row, quant_weights);
}

size_t quantize_q6_K(const float *src, void *dst, int64_t nrow,
                     int64_t n_per_row, const float *quant_weights) {
  return __ggml_quantize_q6_K(src, dst, nrow, n_per_row, quant_weights);
}

void quantize_row_q6_K(const float *src, void *dst, int64_t k) {
  __ggml_quantize_row_q6_K(src, dst, k);
}

template <> void quantize_row_q8_K(const float *src, void *dst, int64_t k) {
  __ggml_quantize_row_q8_K(src, dst, k);
}

void dequantize_row_q4_K(const void *x_raw, float *y, int64_t k) {
  __ggml_dequantize_row_q4_K(x_raw, y, k);
}

void dequantize_row_q4_0(const void *x_raw, float *y, int64_t k) {
  __ggml_dequantize_row_q4_0(x_raw, y, k);
}

void dequantize_row_q6_K(const void *x, float *y, int64_t k) {
  __ggml_dequantize_row_q6_K(x, y, k);
}

template <> void dequantize_row_q8_K(const void *x, float *y, int64_t k) {
  __ggml_dequantize_row_q8_K(x, y, k);
}

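/**
 * @brief repack plain q4_0 blocks into the interleaved q4_0x8 layout expected
 *        by the 8x8 GEMM kernel
 * @param[in] W source q4_0 weight buffer
 * @param[out] repacked_W destination buffer
 * @param[in] data_size size of W in bytes
 * @param[in] M number of rows
 * @param[in] N number of columns
 */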
void repack_q4_0(void *W, void *repacked_W, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
}

void repack_q4_0_to_q4_0_8(void *W, void *repacked_W, size_t data_size,
                           const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_0_to_q4_0_8(W, repacked_W, data_size, M, N);
}

void repack_q4_K(void *W, void *repacked_W, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  __ggml_repack_q4_K_to_q4_K_8(W, repacked_W, data_size, M, N);
}

void unpack_q4_0(const void *in_q4_0x, void *out_q4_0, size_t data_size,
                 const unsigned int M, const unsigned int N) {
  Q4_0Utils::unpackBlocksQ4_0x8((const block_q4_0x8 *)in_q4_0x, data_size, M, N,
                                (block_q4_0 *)out_q4_0);
}

template <>
void softmax_row_inplace(float *qk_out, size_t start_row, size_t end_row,
                         size_t num_heads, float *sink) {
  nntrainer::avx2::softmax_row_inplace<float>(qk_out, start_row, end_row,
                                              num_heads, sink);
}

template <>
void softmax_row(float *qk_out, size_t start_row, size_t end_row,
                 size_t num_heads, float *sink) {
  nntrainer::avx2::softmax_row<float>(qk_out, start_row, end_row, num_heads,
                                      sink);
}

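/**
 * @brief attention helper : combine the fp32 attention weights in `in` with
 *        the fp16 value cache `vcache` and write fp32 results to `output`;
 *        num_cache_head, gqa_size, and head_dim describe the grouped-query
 *        attention layout, and local_window_size limits the attended range
 */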
void compute_fp16vcache_fp32_transposed(int row_num, const float *in,
                                        const uint16_t *vcache, float *output,
                                        int num_cache_head, int gqa_size,
                                        int head_dim,
                                        size_t local_window_size) {
  nntrainer::avx2::compute_fp16vcache_fp32_transposed(
    row_num, in, vcache, output, num_cache_head, gqa_size, head_dim,
    local_window_size);
}

template <>
void compute_kcaches(const float *in, const uint16_t *kcache, float *output,
                     int num_rows, int num_cache_head, int head_dim,
                     int gqa_size, int tile_size, size_t local_window_size) {
  nntrainer::avx2::compute_kcaches<uint16_t>(in, kcache, output, num_rows,
                                             num_cache_head, head_dim, gqa_size,
                                             tile_size, local_window_size);
}

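/**
 * @brief apply rotary position embedding (RoPE) values to `inout` using the
 *        precomputed cos_ / sin_ tables; as the flag name suggests,
 *        only_convert_to_fp16 requests a plain fp16 conversion into `output`
 *        instead of the rotation
 */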
void compute_rotary_emb_value(unsigned int width, unsigned int dim,
                              unsigned int half_, float *inout, void *output,
                              const float *cos_, const float *sin_,
                              bool only_convert_to_fp16) {
  nntrainer::avx2::compute_rotary_emb_value(width, dim, half_, inout, output,
                                            cos_, sin_, only_convert_to_fp16);
}

void rms_norm_wrt_width_fp32_intrinsic(const float *__restrict X,
                                       float *__restrict Y, size_t H, size_t W,
                                       float epsilon) {
  nntrainer::avx2::rms_norm_wrt_width_fp32_intrinsic(X, Y, H, W, epsilon);
}

template <>
void rms_norm_wrt_width_fp16_intrinsic(const float *__restrict X,
                                       float *__restrict Y, size_t H, size_t W,
                                       float epsilon) {
  __fallback_rms_norm_wrt_width_fp16_intrinsic(X, Y, H, W, epsilon);
}

template <>
void clamp(const float *input, float *output, size_t length, float lower_bound,
           float upper_bound) {
  nntrainer::avx2::clamp(input, output, length, lower_bound, upper_bound);
}

void create_q4_0_weights(const uint8_t *int4_weight, uint8_t *q4_0_weight) {
  nntrainer::avx2::create_q4_0_weights(int4_weight, q4_0_weight);
}

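/**
 * @brief convert int4 weights stored in an OSV32/ISV2 blocked layout, together
 *        with their 16-bit per-group scales, into the q4_0x8 block format
 *        consumed by gemm_q4_0; scale_group_size is the number of weights
 *        sharing one scale
 */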
void transform_int4_osv32_isv2_to_q4_0(size_t N, size_t K,
                                       const uint8_t *osv32_weights,
                                       const uint16_t *osv32_scales,
                                       size_t scale_group_size,
                                       void *dst_q4_0x) {
#ifdef __AVX2__
  nntrainer::avx2::transform_int4_osv32_isv2_to_q4_0x8(
    N, K, osv32_weights, osv32_scales, scale_group_size, dst_q4_0x);
#else
  __fallback_transform_int4_osv32_isv2_to_q4_0(
    N, K, osv32_weights, osv32_scales, scale_group_size, 8, dst_q4_0x);
#endif
}
} /* namespace nntrainer */