Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * @file q4_0_utils.cpp
4 : * @date 15 October 2025
5 : * @brief This is Q4_0Utils class for utils for Q4_0 quantization format.
6 : * @see https://github.com/nnstreamer/nntrainer
7 : * @author Maciej Nalewaj <m.nalewaj@samsung.com>
8 : * @bug No known bugs
9 : */
10 :
11 : #include <cassert>
12 : #include <cmath>
13 :
14 : #include "cpu_backend.h"
15 : #include "fp16.h"
16 : #include "int4_utils.h"
17 : #include "nntrainer_error.h"
18 : #include "q4_0_utils.h"
19 : #include "util_func.h"
20 :
21 : #ifdef __AVX2__
22 : #include <immintrin.h>
23 : #endif
24 :
25 : namespace nntrainer {
26 :
27 0 : void Q4_0Utils::unpackOneBlockQ4_0x4(const block_q4_0x4 *in, block_q4_0 *dst) {
28 : unsigned int blck_size_interleave = 8;
29 :
30 0 : for (int i = 0; i < 4; i++) {
31 0 : dst[i].d = in->d[i];
32 : }
33 :
34 : const int end = QK4_0 * 2 / blck_size_interleave;
35 : const uint64_t xor_mask = 0x8888888888888888ULL;
36 :
37 0 : for (int i = 0; i < end; ++i) {
38 0 : int dst_id = i % 4;
39 0 : int dst_offset = (i / 4) * blck_size_interleave;
40 0 : int src_offset = i * blck_size_interleave;
41 :
42 : uint64_t elems;
43 0 : memcpy(&elems, &in->qs[src_offset], sizeof(uint64_t));
44 0 : elems ^= xor_mask;
45 0 : memcpy(&dst[dst_id].qs[dst_offset], &elems, sizeof(uint64_t));
46 : }
47 0 : }
48 :
49 0 : void Q4_0Utils::unpackBlocksQ4_0x4(const block_q4_0x4 *__restrict src,
50 : size_t data_size, size_t nrow, size_t K,
51 : block_q4_0 *__restrict dst) {
52 : int interleave_block = 4;
53 :
54 : const block_q4_0x4 *src_ = src;
55 : block_q4_0 *dst_ = (block_q4_0 *)dst;
56 : block_q4_0 dst_tmp[4];
57 0 : int nblocks = K / QK4_0;
58 :
59 0 : assert(data_size == (nrow / 4) * nblocks * sizeof(block_q4_0x4));
60 :
61 0 : for (size_t b = 0; b < nrow; b += interleave_block) {
62 0 : for (int64_t x = 0; x < nblocks; x++) {
63 0 : unpackOneBlockQ4_0x4(src_++, dst_tmp);
64 :
65 0 : for (size_t i = 0; i < interleave_block; i++) {
66 0 : dst_[x + i * nblocks] = dst_tmp[i];
67 : }
68 : }
69 0 : dst_ += interleave_block * nblocks;
70 : }
71 0 : }
72 :
73 0 : void Q4_0Utils::dequantizeQ4_0x4(const void *q4_weight_repacked, int N, int K,
74 : float *dequantized_weights) {
75 0 : assert(K % QK4_0 == 0);
76 0 : assert(N % 4 == 0);
77 0 : size_t data_size = (K / QK4_0) * (N / 4) * sizeof(block_q4_0x4);
78 0 : std::vector<uint8_t> q4_weight_out(data_size);
79 0 : unpackBlocksQ4_0x4((block_q4_0x4 *)q4_weight_repacked, data_size, N, K,
80 : (block_q4_0 *)q4_weight_out.data());
81 :
82 0 : nntrainer::dequantize_row_q4_0((const void *)q4_weight_out.data(),
83 0 : dequantized_weights, K * N);
84 0 : }
85 :
86 438164 : void Q4_0Utils::unpackOneBlockQ4_0x8(const block_q4_0x8 *in, block_q4_0 *dst) {
87 : unsigned int blck_size_interleave = 8;
88 :
89 3943476 : for (int i = 0; i < 8; i++) {
90 3505312 : dst[i].d = in->d[i];
91 : }
92 :
93 : const int end = QK4_0 * 4 / blck_size_interleave;
94 : const uint64_t xor_mask = 0x8888888888888888ULL;
95 :
96 7448788 : for (int i = 0; i < end; ++i) {
97 7010624 : int dst_id = i % 8;
98 7010624 : int dst_offset = (i / 8) * blck_size_interleave;
99 7010624 : int src_offset = i * blck_size_interleave;
100 :
101 : uint64_t elems;
102 7010624 : memcpy(&elems, &in->qs[src_offset], sizeof(uint64_t));
103 7010624 : elems ^= xor_mask;
104 7010624 : memcpy(&dst[dst_id].qs[dst_offset], &elems, sizeof(uint64_t));
105 : }
106 438164 : }
107 :
108 48 : void Q4_0Utils::unpackBlocksQ4_0x8(const block_q4_0x8 *__restrict src,
109 : size_t data_size, size_t nrow, size_t K,
110 : block_q4_0 *__restrict dst) {
111 : int interleave_block = 8;
112 :
113 : const block_q4_0x8 *src_ = src;
114 : block_q4_0 *dst_ = (block_q4_0 *)dst;
115 : block_q4_0 dst_tmp[8];
116 48 : int nblocks = K / QK4_0;
117 :
118 48 : assert(data_size == (nrow / 8) * nblocks * sizeof(block_q4_0x8));
119 :
120 5880 : for (size_t b = 0; b < nrow; b += interleave_block) {
121 443996 : for (int64_t x = 0; x < nblocks; x++) {
122 438164 : unpackOneBlockQ4_0x8(src_++, dst_tmp);
123 :
124 3943476 : for (size_t i = 0; i < interleave_block; i++) {
125 3505312 : dst_[x + i * nblocks] = dst_tmp[i];
126 : }
127 : }
128 5832 : dst_ += interleave_block * nblocks;
129 : }
130 48 : }
131 :
132 0 : void Q4_0Utils::dequantizeQ4_0x8(const void *q4_weight_repacked, int N, int K,
133 : float *dequantized_weights) {
134 0 : assert(K % QK4_0 == 0);
135 0 : assert(N % 8 == 0);
136 0 : size_t data_size = (K / QK4_0) * (N / 8) * sizeof(block_q4_0x8);
137 0 : std::vector<uint8_t> q4_weight_out(data_size);
138 0 : unpackBlocksQ4_0x8((block_q4_0x8 *)q4_weight_repacked, data_size, N, K,
139 : (block_q4_0 *)q4_weight_out.data());
140 :
141 0 : nntrainer::dequantize_row_q4_0((const void *)q4_weight_out.data(),
142 0 : dequantized_weights, K * N);
143 0 : }
144 :
145 0 : inline static void nntr_make_block_q4_0x4(const block_q4_0 *in,
146 : block_q4_0x4 *out) {
147 : constexpr size_t IN_CNT = 4;
148 : constexpr size_t HALF_SIZE = 8;
149 :
150 0 : for (int i = 0; i < IN_CNT; ++i) {
151 0 : out->d[i] = in[i].d;
152 : }
153 :
154 0 : for (int i = 0; i < IN_CNT; ++i) {
155 0 : memcpy(&out->qs[i * HALF_SIZE], &in[i].qs[0], HALF_SIZE);
156 : }
157 0 : for (int i = 0; i < IN_CNT; ++i) {
158 0 : memcpy(&out->qs[IN_CNT * HALF_SIZE + i * HALF_SIZE], &in[i].qs[8],
159 : HALF_SIZE);
160 : }
161 0 : }
162 :
163 219082 : inline static void nntr_make_block_q4_0x8(const block_q4_0 *in,
164 : block_q4_0x8 *out) {
165 : constexpr size_t IN_CNT = 8;
166 : constexpr size_t HALF_SIZE = 8;
167 :
168 1971738 : for (int i = 0; i < IN_CNT; ++i) {
169 1752656 : out->d[i] = in[i].d;
170 : }
171 :
172 1971738 : for (int i = 0; i < IN_CNT; ++i) {
173 1752656 : memcpy(&out->qs[i * HALF_SIZE], &in[i].qs[0], HALF_SIZE);
174 : }
175 1971738 : for (int i = 0; i < IN_CNT; ++i) {
176 1752656 : memcpy(&out->qs[IN_CNT * HALF_SIZE + i * HALF_SIZE], &in[i].qs[8],
177 : HALF_SIZE);
178 : }
179 219082 : }
180 :
181 24 : void Q4_0Utils::transformQ4_0x_FromInt4(size_t N, size_t K,
182 : const uint8_t *osv32_weights,
183 : const uint16_t *osv32_scales,
184 : size_t scale_group_size,
185 : int q4_0x_block_size, void *dst_q4_0x) {
186 :
187 24 : NNTR_THROW_IF((!(scale_group_size == 32 || scale_group_size == 64 ||
188 : scale_group_size == 128)),
189 : std::invalid_argument)
190 : << "Scale group size must be 32/64/128";
191 24 : NNTR_THROW_IF(K % QK4_0 != 0, std::invalid_argument)
192 : << "K size must be divisable by QK4_0 (32)";
193 24 : NNTR_THROW_IF(N % 8 != 0, std::invalid_argument)
194 : << "N size must be divisable by 8";
195 24 : NNTR_THROW_IF((!(q4_0x_block_size == 4 || q4_0x_block_size == 8)),
196 : std::invalid_argument)
197 : << "q4_0x_block_size must be 4 or 8";
198 :
199 : static constexpr const size_t ROW_BLOCK_SIZE = 32;
200 : static constexpr const size_t COLUMN_BLOCK_SIZE = 2;
201 :
202 : uint8_t int4_weight[16];
203 : uint16_t scale;
204 : block_q4_0 dst_tmp[8];
205 : uint8_t *dst_ = reinterpret_cast<uint8_t *>(dst_q4_0x);
206 :
207 : // --- Layout ---
208 24 : const size_t rows_count_pad = align(N, ROW_BLOCK_SIZE);
209 24 : const size_t columns_count_pad = align(K, ROW_BLOCK_SIZE);
210 24 : const size_t column_blocks_count =
211 : columns_count_pad / COLUMN_BLOCK_SIZE; // COLUMN_BLOCK_SIZE == 2
212 : const size_t bytes_per_row_block_span = column_blocks_count * ROW_BLOCK_SIZE;
213 :
214 2940 : for (size_t row_id = 0; row_id < N; row_id += q4_0x_block_size) {
215 2916 : const size_t row_block_id = row_id / ROW_BLOCK_SIZE;
216 2916 : size_t i_in_block = row_id % ROW_BLOCK_SIZE;
217 221998 : for (int64_t column_idx = 0; column_idx < K; column_idx += QK4_0) {
218 1971738 : for (size_t i = 0; i < q4_0x_block_size; i++) {
219 1752656 : int row_idx = row_id + i;
220 : // Address the bytes for this row
221 1752656 : const size_t row_block_base =
222 1752656 : row_block_id * bytes_per_row_block_span + i_in_block + i;
223 1752656 : int index0 = row_block_base + (column_idx / 2) * ROW_BLOCK_SIZE;
224 :
225 29795152 : for (size_t column_block_id = 0; column_block_id < 16;
226 : ++column_block_id) {
227 28042496 : int4_weight[column_block_id] =
228 28042496 : osv32_weights[index0 + column_block_id * ROW_BLOCK_SIZE];
229 : }
230 1752656 : scale = osv32_scales[row_idx +
231 1752656 : (column_idx / scale_group_size) * rows_count_pad];
232 :
233 1752656 : create_q4_0_weights(int4_weight, dst_tmp[i].qs);
234 1752656 : dst_tmp[i].d = scale;
235 : }
236 : // Repack Q4_0 data
237 219082 : if (q4_0x_block_size == 4) {
238 0 : nntr_make_block_q4_0x4(dst_tmp, (block_q4_0x4 *)dst_);
239 : } else {
240 219082 : nntr_make_block_q4_0x8(dst_tmp, (block_q4_0x8 *)dst_);
241 : }
242 219082 : dst_ += q4_0x_block_size * sizeof(block_q4_0);
243 : }
244 : }
245 24 : }
246 :
247 0 : void Q4_0Utils::printBlockQ4_0(const block_q4_0 *block) {
248 : printf("Q4_0: ");
249 0 : for (int i = 0; i < 16; i++) {
250 0 : printf("%i %i ", block->qs[i] & 0x0F, (block->qs[i] >> 4) & 0x0F);
251 : }
252 0 : printf("| scale:%f\n", compute_fp16_to_fp32(block->d));
253 0 : }
254 :
255 : } // namespace nntrainer
|