Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * Copyright (C) 2024 Sungsik Kong <ss.kong@samsung.com>
4 : *
5 : * @file fallback_internal.cpp
6 : * @date 23 April 2024
7 : * @see https://github.com/nnstreamer/nntrainer
8 : * @author Sungsik Kong <ss.kong@samsung.com>
9 : * @bug No known bugs except for NYI items
10 : * @brief Fallback computation functions (raw implementation)
11 : *
12 : */
13 :
14 : #include <algorithm>
15 : #include <assert.h>
16 : #include <climits>
17 : #include <cmath>
18 : #include <cstdint>
: #include <cstring> // std::memcpy (used below; added as it is not pulled in explicitly)
19 : #include <fallback_internal.h>
20 : #include <limits>
21 : #include <q4_0_utils.h>
22 : #include <stdexcept>
23 : #include <tensor_dim.h>
24 : #include <util_func.h>
: #include <vector> // std::vector (used below; added as it is not pulled in explicitly)
25 :
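: // The sgemv_loop macro below computes, for each output index ci,
: //   Y[ci * incY] = beta * Y[ci * incY] + sum over cj of A[i + j * lda] * X[cj * incX];
: // the caller binds (ci, cj) to (i, j) or (j, i), so the same body serves both
: // the plain and the transposed matrix-vector product. Note that alpha is not
: // applied inside the macro.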
26 : #define sgemv_loop(ci, cj, cM, cN) \
27 : do { \
28 : float y0; \
29 : unsigned int i, j; \
30 : for (ci = 0; ci != cM; ci++) { \
31 : y0 = 0.0f; \
32 : if (beta != 0.0f) { \
33 : y0 = Y[ci * incY] * beta; \
34 : } \
35 : for (cj = 0; cj != cN; cj++) \
36 : y0 += A[i + j * lda] * X[cj * incX]; \
37 : Y[ci * incY] = y0; \
38 : } \
39 : } while (0);
40 : namespace nntrainer {
41 :
42 : /**
43 : * @brief struct of q4_0x8 block
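: * Presumed layout (inferred from the unpack routine below): d[8] holds the
: * fp16 scale of each of the 8 interleaved q4_0 sub-blocks, and qs[128] holds
: * their packed 4-bit quants (two values per byte), interleaved row-wise in
: * 8-byte (u64) chunks.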
44 : */
45 : struct block_q4_0x8 {
46 : uint16_t d[8]; // 16B
47 : uint8_t qs[128]; // 16 x u64
48 : };
49 :
50 3 : void __fallback_sscal(const unsigned int N, const float alpha, float *X,
51 : const unsigned int incX) {
52 3 : assert(incX > 0);
53 37 : for (unsigned int i = 0; i < N; ++i)
54 34 : X[i * incX] = alpha * X[i * incX];
55 3 : }
56 :
57 3 : float __fallback_snrm2(const unsigned int N, const float *X,
58 : const unsigned int incX) {
59 3 : assert(incX > 0);
60 : float sum = 0.0f;
61 : float tmp;
62 :
63 110 : for (unsigned int i = 0; i < N; i++) {
64 107 : tmp = X[i * incX];
65 107 : sum += tmp * tmp;
66 : }
67 3 : return sqrt(sum);
68 : }
69 :
70 2 : void __fallback_copy_s16_fp32(const unsigned int N, const int16_t *X,
71 : float *Y) {
72 22 : for (unsigned int i = 0; i < N; ++i) {
73 20 : Y[i] = X[i];
74 : }
75 2 : }
76 :
77 1 : void __fallback_copy_u16_fp32(const unsigned int N, const uint16_t *X,
78 : float *Y) {
79 5 : for (unsigned int i = 0; i < N; ++i) {
80 4 : Y[i] = X[i];
81 : }
82 1 : }
83 :
84 1 : void __fallback_copy_fp32_u32(const unsigned int N, const float *X,
85 : uint32_t *Y) {
86 5 : for (unsigned int i = 0; i < N; ++i) {
87 4 : Y[i] = static_cast<uint32_t>(X[i]);
88 : }
89 1 : }
90 :
91 1 : void __fallback_copy_fp32_u16(const unsigned int N, const float *X,
92 : uint16_t *Y) {
93 5 : for (unsigned int i = 0; i < N; ++i) {
94 4 : Y[i] = static_cast<uint16_t>(X[i]);
95 : }
96 1 : }
97 :
98 1 : void __fallback_copy_fp32_u8(const unsigned int N, const float *X, uint8_t *Y) {
99 5 : for (unsigned int i = 0; i < N; ++i) {
100 4 : Y[i] = static_cast<uint8_t>(X[i]);
101 : }
102 1 : }
103 :
104 1 : void __fallback_copy_fp32_s16(const unsigned int N, const float *X,
105 : int16_t *Y) {
106 5 : for (unsigned int i = 0; i < N; ++i) {
107 4 : Y[i] = static_cast<int16_t>(X[i]);
108 : }
109 1 : }
110 :
111 3 : void __fallback_copy_fp32_s8(const unsigned int N, const float *X, int8_t *Y) {
112 5281 : for (unsigned int i = 0; i < N; ++i) {
113 5278 : Y[i] = static_cast<int8_t>(X[i]);
114 : }
115 3 : }
116 :
117 3 : void __fallback_copy_s16(const unsigned int N, const int16_t *X, int16_t *Y) {
118 113 : for (unsigned int i = 0; i < N; ++i) {
119 110 : Y[i] = X[i];
120 : }
121 3 : }
122 :
123 4 : void __fallback_copy_u16(const unsigned int N, const uint16_t *X, uint16_t *Y) {
124 548 : for (unsigned int i = 0; i < N; ++i) {
125 544 : Y[i] = X[i];
126 : }
127 4 : }
128 :
129 2 : void __fallback_scopy(const unsigned int N, const float *X,
130 : const unsigned int incX, float *Y,
131 : const unsigned int incY) {
132 2 : assert(incX > 0 && incY > 0);
133 26 : for (unsigned int i = 0; i < N; ++i)
134 24 : Y[i * incY] = X[i * incX];
135 2 : }
136 :
137 1 : void __fallback_scopy(const unsigned int N, const uint8_t *X,
138 : const unsigned int incX, uint8_t *Y,
139 : const unsigned int incY) {
140 17 : for (unsigned int idx = 0; idx < N; idx++) {
141 16 : Y[idx * incY] = X[idx * incX];
142 : }
143 1 : }
144 :
145 10 : void __fallback_scopy(const unsigned int N, const int8_t *X,
146 : const unsigned int incX, int8_t *Y,
147 : const unsigned int incY) {
148 636 : for (unsigned int idx = 0; idx < N; idx++) {
149 626 : Y[idx * incY] = X[idx * incX];
150 : }
151 10 : }
152 :
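: // Unpacks two 4-bit values per input byte: the high nibble goes to the even
: // output index and the low nibble to the odd one; incX/incY are ignored here.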
153 1 : void __fallback_scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
154 : const unsigned int incX, float *Y,
155 : const unsigned int incY) {
156 5 : for (unsigned int idx = 0; idx < N; idx++) {
157 4 : Y[2 * idx] = static_cast<float>(X[idx] >> 4);
158 4 : Y[2 * idx + 1] = static_cast<float>(X[idx] & 0x0f);
159 : }
160 1 : }
161 :
162 : /// @todo Functions with the same internal representation should be merged.
163 2 : void __fallback_scopy_uint8_to_float32(const unsigned int N, const uint8_t *X,
164 : const unsigned int incX, float *Y,
165 : const unsigned int incY) {
166 34 : for (unsigned int idx = 0; idx < N; idx++) {
167 32 : Y[idx * incY] = X[idx * incX];
168 : }
169 2 : }
170 :
171 6 : void __fallback_scopy_int8_to_float32(const unsigned int N, const int8_t *X,
172 : const unsigned int incX, float *Y,
173 : const unsigned int incY) {
174 212 : for (unsigned int idx = 0; idx < N; idx++) {
175 206 : Y[idx * incY] = X[idx * incX];
176 : }
177 6 : }
178 :
179 3 : float __fallback_sdot(const unsigned int N, const float *X,
180 : const unsigned int incX, const float *Y,
181 : const unsigned int incY) {
182 : float ret = 0;
183 14 : for (unsigned int i = 0; i < N; ++i) {
184 11 : ret += X[i * incX] * Y[i * incY];
185 : }
186 3 : return ret;
187 : }
188 :
189 2 : void __fallback_saxpy(const unsigned int N, const float alpha, const float *X,
190 : const unsigned int incX, float *Y,
191 : const unsigned int incY) {
192 2 : assert(incX > 0 && incY > 0);
193 10 : for (unsigned int i = 0; i < N; ++i)
194 8 : Y[i * incY] = Y[i * incY] + X[i * incX] * alpha;
195 2 : }
196 :
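: // Naive triple-loop GEMM: C = alpha * op(A) * op(B) + beta * C, with op()
: // selected by TransA/TransB. Accumulation is carried in double; TStorageOrder
: // is not referenced in this fallback path.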
197 4 : void __fallback_sgemm(const unsigned int TStorageOrder, bool TransA,
198 : bool TransB, const unsigned int M, const unsigned int N,
199 : const unsigned int K, const float alpha, const float *A,
200 : const unsigned int lda, const float *B,
201 : const unsigned int ldb, const float beta, float *C,
202 : const unsigned int ldc) {
203 12 : for (unsigned int m = 0; m < M; ++m) {
204 24 : for (unsigned int n = 0; n < N; ++n) {
205 : double c = 0.0;
206 16 : float c_old = C[m * ldc + n];
207 48 : for (unsigned int k = 0; k < K; ++k) {
208 : float a, b;
209 32 : a = ((TransA == true) ? A[k * lda + m] : A[m * lda + k]);
210 32 : b = ((TransB == true) ? B[n * ldb + k] : B[k * ldb + n]);
211 32 : c += a * b;
212 : }
213 16 : C[m * ldc + n] = alpha * c;
214 16 : if (beta != 0.0f) {
215 4 : C[m * ldc + n] += beta * c_old;
216 : }
217 : }
218 : }
219 4 : }
220 :
221 0 : void __fallback_sgemv(const unsigned int TStorageOrder, bool TransA,
222 : const unsigned int M, const unsigned int N,
223 : const float alpha, const float *A, const unsigned int lda,
224 : const float *X, const unsigned int incX, const float beta,
225 : float *Y, const unsigned int incY) {
226 :
227 0 : if (TransA == true) {
228 0 : sgemv_loop(i, j, N, M);
229 : } else {
230 0 : sgemv_loop(j, i, M, N);
231 : }
232 0 : }
233 :
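: // Returns the index of the element with the largest absolute value
: // (BLAS isamax semantics).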
234 3 : unsigned int __fallback_isamax(const unsigned int N, const float *X,
235 : const unsigned int incX) {
236 : unsigned int max_idx = 0;
237 3 : float max_val = std::abs(X[0]);
238 12 : for (unsigned int n = 1; n < N; ++n) {
239 9 : float cur_val = std::abs(X[n * incX]);
240 9 : if (cur_val > max_val) {
241 : max_val = cur_val;
242 : max_idx = n;
243 : }
244 : }
245 :
246 3 : return max_idx;
247 : }
248 :
249 : template <>
250 12 : void __fallback_sine(const unsigned int N, float *X, float *Y, float alpha,
251 : float beta) {
252 : unsigned int i = 0;
253 2073635 : while (i < N) {
254 2073623 : Y[i] = std::sin(alpha * X[i]) * beta;
255 2073623 : ++i;
256 : }
257 12 : }
258 :
259 : template <>
260 14 : void __fallback_cosine(const unsigned int N, float *X, float *Y, float alpha,
261 : float beta) {
262 : unsigned int i = 0;
263 2073642 : while (i < N) {
264 2073628 : Y[i] = std::cos(alpha * X[i]) * beta;
265 2073628 : ++i;
266 : }
267 14 : }
268 :
269 1 : void __fallback_inv_sqrt_inplace(const unsigned int N, float *X) {
270 5 : for (unsigned int i = 0; i < N; ++i) {
271 4 : X[i] = 1 / std::sqrt(static_cast<float>(X[i]));
272 : }
273 1 : }
274 :
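: // The element-wise helpers below compute Z = X (op) (alpha * Y) + beta * Z,
: // advancing X and Z by o_stride and Y by i_stride, presumably so that a
: // lower-dimensional Y can be broadcast against X (e.g. i_stride == 0).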
275 65 : void __fallback_ele_mul(const unsigned int N, const float *X, const float *Y,
276 : float *Z, float alpha, float beta,
277 : unsigned int i_stride, unsigned int o_stride) {
278 193609 : for (unsigned int i = 0; i < N; ++i) {
279 193544 : *Z = *X * alpha * *Y + ((0.0f == beta) ? 0.0f : beta * *Z);
280 193544 : X += o_stride;
281 193544 : Y += i_stride;
282 193544 : Z += o_stride;
283 : }
284 65 : }
285 :
286 64 : void __fallback_ele_add(const unsigned int N, const float *X, const float *Y,
287 : float *Z, float alpha, float beta,
288 : unsigned int i_stride, unsigned int o_stride) {
289 193604 : for (unsigned int i = 0; i < N; ++i) {
290 193540 : *Z = *X + alpha * *Y + ((0.0f == beta) ? 0.0f : beta * *Z);
291 193540 : X += o_stride;
292 193540 : Y += i_stride;
293 193540 : Z += o_stride;
294 : }
295 64 : }
296 :
297 1 : void __fallback_ele_sub(const unsigned N, const float *X, const float *Y,
298 : float *Z, float alpha, float beta,
299 : unsigned int i_stride, unsigned int o_stride) {
300 5 : for (unsigned int i = 0; i < N; ++i) {
301 4 : *Z = *X - alpha * *Y + ((0.0f == beta) ? 0.0f : beta * *Z);
302 4 : X += o_stride;
303 4 : Y += i_stride;
304 4 : Z += o_stride;
305 : }
306 1 : }
307 :
308 334 : void __fallback_ele_div(const unsigned N, const float *X, const float *Y,
309 : float *Z, float alpha, float beta,
310 : unsigned int i_stride, unsigned int o_stride) {
311 3667 : for (unsigned int i = 0; i < N; ++i) {
312 3333 : *Z = *X / (alpha * *Y) + ((0.0f == beta) ? 0.0f : beta * *Z);
313 3333 : X += o_stride;
314 3333 : Y += i_stride;
315 3333 : Z += o_stride;
316 : }
317 334 : }
318 :
319 2 : void __fallback_transpose_matrix(const unsigned int M, const unsigned int N,
320 : const float *src, unsigned int ld_src,
321 : float *dst, unsigned int ld_dst) {
322 7 : for (unsigned int i = 0; i < M; i++) {
323 20 : for (unsigned int j = 0; j < N; j++) {
324 15 : dst[i + j * ld_dst] = src[i * ld_src + j];
325 : }
326 : }
327 2 : }
328 :
329 3 : bool __fallback_isValid(const unsigned int N, const float *X) {
330 9 : for (size_t i = 0; i < N; ++i) {
331 8 : if (!isFloatValid(*X)) {
332 : return false;
333 : }
334 6 : ++X;
335 : }
336 :
337 : return true;
338 : }
339 :
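: // Unpacks q4_0x8 blocks (8 interleaved q4_0 rows) into transposed outputs
: // (layout inferred from the indexing below): dT stores, per K/32 scale
: // sub-block, the N per-row fp16 scales contiguously; qsT stores the repacked
: // quants as 8 u16 columns of length N per sub-block. Columns are processed
: // in tiles of CT scale sub-blocks to keep the working set small.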
340 0 : void __fallback_unpack_q4_0x8_transpose16(const void *src,
341 : uint16_t *__restrict dT,
342 : uint16_t *__restrict qsT, int N,
343 : int K, int CT) {
344 : const auto *x = static_cast<const block_q4_0x8 *>(src);
345 :
346 0 : const int groups_N8 = N / 8; // # of 8-row groups
347 0 : const int cols_scales = K / 32; // # subblocks along K (scales columns)
348 : const uint64_t mask = 0x8888888888888888ULL; // flip MSB of each nibble
349 :
350 : // Tile over columns to keep working set small.
351 0 : for (int c0 = 0; c0 < cols_scales; c0 += CT) {
352 0 : const int c1 = std::min(c0 + CT, cols_scales);
353 :
354 : // Process rows in natural 8-row groups for source-friendly access
355 0 : for (int b = 0; b < groups_N8; ++b) {
356 : // For each column in the tile, read the source block contiguously
357 0 : for (int c = c0; c < c1; ++c) {
358 0 : const block_q4_0x8 &blk = x[b * cols_scales + c];
359 :
360 : // Precompute column bases in the transposed outputs
361 0 : unsigned short *__restrict dT_c = dT + c * N; // column c in dT
362 : unsigned short *__restrict qsT_c0 =
363 0 : qsT + (c * 8) * N; // first of 8 columns for this subblock
364 :
365 : // Walk the 8 rows inside this block group
366 0 : for (int off = 0; off < 8; ++off) {
367 0 : const int r = b * 8 + off; // absolute row index in [0..N-1]
368 :
369 : // ---------- SCALES (fp16), transposed on the fly ----------
370 0 : dT_c[r] = blk.d[off];
371 :
372 : // ---------- QUANTS (bytes → XOR → swizzle → 8×u16), transposed ----------
373 : // Load the two u64 chunks that belong to this row
374 : uint64_t v0, v1;
375 0 : std::memcpy(&v0, blk.qs + 8 * off, 8);
376 0 : std::memcpy(&v1, blk.qs + 8 * (off + 8), 8);
377 0 : v0 ^= mask;
378 0 : v1 ^= mask;
379 :
380 : unsigned char in[16];
381 : std::memcpy(in + 0, &v0, 8);
382 : std::memcpy(in + 8, &v1, 8);
383 :
384 : // nibble-lane swizzle (identical to the reference implementation)
385 : unsigned char out[16];
386 0 : for (int i = 0; i < 8; ++i) {
387 0 : const unsigned char x0 = in[2 * i + 0];
388 0 : const unsigned char x1 = in[2 * i + 1];
389 0 : out[i + 0] = (unsigned char)((x0 & 0x0F) | ((x1 & 0x0F) << 4));
390 0 : out[i + 8] = (unsigned char)(((x0 & 0xF0) >> 4) | (x1 & 0xF0));
391 : }
392 :
393 : // pack to 8×u16 and store to transposed columns j = c*8 .. c*8+7 at
394 : // row r
395 0 : for (int t = 0; t < 8; ++t) {
396 0 : const unsigned short w =
397 0 : (unsigned short)((unsigned short)out[2 * t + 0] |
398 0 : ((unsigned short)out[2 * t + 1] << 8));
399 0 : qsT_c0[t * N + r] = w; // column (c*8 + t), row r
400 : }
401 : } // off
402 : } // c in tile
403 : } // b
404 : } // c0 tiles
405 0 : }
406 :
407 : template <>
408 0 : void __fallback_calc_trigonometric_vals_dup(unsigned int N_half, float *angle,
409 : float *cos_, float *sin_,
410 : unsigned int from,
411 : float attention_scaling) {
412 : throw std::runtime_error(
413 : "Error: No implementation of rotary embedding layer incremental_forwarding "
414 0 : "with SIMD acceleration except for NEON!");
415 : }
416 :
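: // SwiGLU activation: X[i] = silu(Y[i]) * Z[i], where silu(y) = y / (1 + exp(-y)).
: // The second overload applies an extra slope alpha inside the sigmoid.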
417 1 : void __fallback_swiglu(const unsigned int N, float *X, float *Y, float *Z) {
418 : unsigned int i = 0;
419 5 : while (i < N) {
420 4 : X[i] = (Y[i] / (1.f + std::exp(-Y[i]))) * Z[i];
421 4 : ++i;
422 : }
423 1 : }
424 :
425 1 : void __fallback_swiglu(const unsigned int N, float *X, float *Y, float *Z,
426 : float alpha) {
427 : unsigned int i = 0;
428 5 : while (i < N) {
429 4 : X[i] = (Y[i] / (1.f + std::exp(-alpha * Y[i]))) * Z[i];
430 4 : ++i;
431 : }
432 1 : }
433 :
434 4 : float __fallback_max(const unsigned int N, float *X) {
435 4 : std::vector<float> v(X, X + N);
436 8 : return *std::max_element(v.begin(), v.end());
437 4 : }
438 :
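: // Numerically stable softmax: subtracts the maximum element before
: // exponentiating to avoid overflow.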
439 2 : void __fallback_softmax(const unsigned int N, float *X, float *Y) {
440 : unsigned int i = 0;
441 : float sum = 0.f;
442 2 : float max_x = __fallback_max(N, X);
443 10 : while (i < N) {
444 8 : sum += std::exp(X[i] - max_x);
445 8 : ++i;
446 : }
447 : i = 0;
448 10 : while (i < N) {
449 8 : Y[i] = std::exp(X[i] - max_x) / sum;
450 8 : ++i;
451 : }
452 2 : }
453 :
454 : template <>
455 0 : void __fallback_gemm_q4_0(const unsigned int M, const unsigned int N,
456 : const unsigned int K, const float *A,
457 : const unsigned int lda, const void *B,
458 : const unsigned int ldb, float *C,
459 : const unsigned int ldc) {
460 0 : throw std::runtime_error("NYI : __fallback_gemm_q4_0");
461 : }
462 :
463 0 : void __fallback_gemm_q4_K(const unsigned int M, const unsigned int N,
464 : const unsigned int K, const float *A,
465 : const unsigned int lda, const void *B,
466 : const unsigned int ldb, float *C,
467 : const unsigned int ldc) {
468 0 : throw std::runtime_error("NYI : __fallback_gemm_q4_K");
469 : }
470 :
471 0 : float __fallback_dot_q6_K_q8_K(const unsigned int K, const void *v_q6_K,
472 : const void *v_q8_K) {
473 0 : throw std::runtime_error("NYI : __fallback_dot_q6_K_q8_K");
474 : return 0;
475 : }
476 :
477 0 : float __fallback_dot_q6_K_f32(const unsigned int K, const void *v_q6_K,
478 : const float *f) {
479 0 : throw std::runtime_error("NYI : __fallback_dot_q6_K_f32");
480 : return 0;
481 : }
482 :
483 : template <>
484 0 : void __fallback_gemm_q6_K(const unsigned int M, const unsigned int N,
485 : const unsigned int K, const float *A,
486 : const unsigned int lda, const void *B,
487 : const unsigned int ldb, float *C,
488 : const unsigned int ldc) {
489 0 : throw std::runtime_error("NYI : __fallback_gemm_q6_K");
490 : }
491 :
492 0 : size_t __fallback_quantize_q4_0(const float *src, void *dst, int64_t nrow,
493 : int64_t n_per_row, const float *quant_weights) {
494 0 : throw std::runtime_error("NYI : __fallback_quantize_q4_0");
495 : return 1;
496 : }
497 :
498 0 : size_t __fallback_quantize_q4_K(const float *src, void *dst, int64_t nrow,
499 : int64_t n_per_row, const float *quant_weights) {
500 0 : throw std::runtime_error("NYI : __fallback_quantize_q4_K");
501 : return 1;
502 : }
503 :
504 0 : size_t __fallback_quantize_q6_K(const float *src, void *dst, int64_t nrow,
505 : int64_t n_per_row, const float *quant_weights) {
506 0 : throw std::runtime_error("NYI : __fallback_quantize_q6_K");
507 : return 1;
508 : }
509 :
510 0 : void __fallback_dequantize_row_q4_K(const void *x_raw, float *y, int64_t k) {
511 0 : throw std::runtime_error("NYI : __fallback_dequantize_row_q4_K");
512 : }
513 :
514 0 : void __fallback_dequantize_row_q4_0(const void *x_raw, float *y, int64_t k) {
515 0 : throw std::runtime_error("NYI : __fallback_dequantize_row_q4_0");
516 : }
517 :
518 0 : void __fallback_dequantize_row_q6_K(const void *x, float *y, int64_t k) {
519 0 : throw std::runtime_error("NYI : __fallback_dequantize_row_q6_K");
520 : }
521 :
522 0 : void __fallback_quantize_row_q6_K(const float *src, void *dst, int64_t k) {
523 0 : throw std::runtime_error("NYI : __fallback_quantize_row_q6_K");
524 : }
525 :
526 : template <>
527 0 : void __fallback_quantize_row_q8_K(const float *src, void *dst, int64_t k) {
528 0 : throw std::runtime_error("NYI : __fallback_quantize_row_q8_K");
529 : }
530 :
531 : template <>
532 0 : void __fallback_dequantize_row_q8_K(const void *x, float *y, int64_t k) {
533 0 : throw std::runtime_error("NYI : __fallback_dequantize_row_q8_K");
534 : }
535 :
536 0 : void __fallback_repack_q4_0_to_q4_0_4(void *W, void *repacked_W,
537 : size_t data_size, const unsigned int M,
538 : const unsigned int N) {
539 0 : throw std::runtime_error("NYI : __fallback_repack_q4_0_to_q4_0_4");
540 : }
541 :
542 0 : void __fallback_repack_q4_0_to_q4_0_8(void *W, void *repacked_W,
543 : size_t data_size, const unsigned int M,
544 : const unsigned int N) {
545 0 : throw std::runtime_error("NYI : __fallback_repack_q4_0_to_q4_0_8");
546 : }
547 :
548 0 : void __fallback_repack_q4_K_to_q4_K_8(void *W, void *repacked_W,
549 : size_t data_size, const unsigned int M,
550 : const unsigned int N) {
551 0 : throw std::runtime_error("NYI : __fallback_repack_q4_K_to_q4_K_8");
552 : }
553 :
554 0 : void __fallback_unpack_q4_0_8_to_q4_0(const void *in_q4_0x, void *out_q4_0,
555 : size_t data_size, const unsigned int M,
556 : const unsigned int N) {
557 0 : throw std::runtime_error("NYI : __fallback_unpack_q4_0_8_to_q4_0");
558 : }
559 :
560 0 : void __fallback_softmax_row_inplace(float *qk_out, size_t start_row,
561 : size_t end_row, size_t num_heads) {
562 0 : throw std::runtime_error("NYI : __fallback_softmax_row_inplace");
563 : }
564 :
565 0 : void __fallback_softmax_row(float *qk_out, size_t start_row, size_t end_row,
566 : size_t num_heads) {
567 0 : throw std::runtime_error("NYI : __fallback_softmax_row");
568 : }
569 :
570 0 : void __fallback_compute_fp16vcache_fp32_transposed(
571 : int row_num, const float *in, const uint16_t *vcache, float *output,
572 : int num_cache_head, int gqa_size, int head_dim, size_t local_window_size) {
573 : throw std::runtime_error(
574 0 : "NYI : __fallback_compute_fp16vcache_fp32_transposed");
575 : }
576 :
577 : template <>
578 0 : void __fallback_compute_kcaches(const float *in, const uint16_t *kcache,
579 : float *output, int num_rows, int num_cache_head,
580 : int head_dim, int gqa_size, int tile_size,
581 : size_t local_window_size) {
582 0 : throw std::runtime_error("NYI : __fallback_compute_kcaches");
583 : }
584 :
585 0 : void __fallback_compute_rotary_emb_value(unsigned int width, unsigned int dim,
586 : unsigned int half_, float *inout,
587 : void *output, const float *cos_,
588 : const float *sin_,
589 : bool only_convert_to_fp16) {
590 0 : throw std::runtime_error("NYI : __fallback_compute_rotary_emb_value");
591 : }
592 :
593 0 : void __fallback_rms_norm_wrt_width_fp32_intrinsic(const float *__restrict X,
594 : float *__restrict Y, size_t H,
595 : size_t W, float epsilon) {
596 : throw std::runtime_error(
597 0 : "NYI : __fallback_rms_norm_wrt_width_fp32_intrinsic");
598 : }
599 :
600 : template <>
601 0 : void __fallback_rms_norm_wrt_width_fp16_intrinsic(const float *__restrict X,
602 : float *__restrict Y, size_t H,
603 : size_t W, float epsilon) {
604 : throw std::runtime_error(
605 0 : "NYI : __fallback_rms_norm_wrt_width_fp16_intrinsic");
606 : }
607 :
608 : template <>
609 22 : void __fallback_clamp(const float *input, float *output, size_t length,
610 : float lower_bound, float upper_bound) {
611 64540 : for (size_t i = 0; i < length; ++i) {
612 129036 : output[i] = std::clamp(input[i], lower_bound, upper_bound);
613 : }
614 22 : }
615 :
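: // Repacks one 32-value group of packed int4 weights into q4_0 nibble order:
: // presumably value k is paired with value k + 16 in a single output byte, so
: // that low nibbles hold the first half of the group and high nibbles the
: // second half.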
616 0 : void __fallback_create_q4_0_weights(const uint8_t *int4_weight,
617 : uint8_t *q4_0_weight) {
618 0 : for (int i = 0; i < 8; i++) {
619 0 : char v0 = int4_weight[i] & 0xF;
620 0 : char v1 = (int4_weight[i] >> 4) & 0xF;
621 0 : char v2 = int4_weight[8 + i] & 0xF;
622 : char v3 = (int4_weight[8 + i] >> 4) & 0xF;
623 0 : q4_0_weight[2 * i] = (v0 | (v2 << 4));
624 0 : q4_0_weight[2 * i + 1] = (v1 | (v3 << 4));
625 : }
626 0 : }
627 :
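: // Thin wrapper: delegates to Q4_0Utils::transformQ4_0x_FromInt4 with a
: // block-interleave argument of 8 (presumably producing q4_0x8 output).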
628 0 : void __fallback_transform_q4_0x_from_int4(size_t N, size_t K,
629 : const uint8_t *osv32_weights,
630 : const uint16_t *osv32_scales,
631 : size_t scale_group_size,
632 : void *dst_q4_0x) {
633 0 : Q4_0Utils::transformQ4_0x_FromInt4(N, K, osv32_weights, osv32_scales,
634 : scale_group_size, 8, dst_q4_0x);
635 0 : }
636 :
637 : } // namespace nntrainer