// SPDX-License-Identifier: Apache-2.0
/**
 * @file q4_0_utils.cpp
 * @date 15 October 2025
 * @brief This is Q4_0Utils class with utility functions for the Q4_0 quantization format.
 * @see https://github.com/nntrainer/nntrainer
 * @author Maciej Nalewaj <m.nalewaj@samsung.com>
 * @bug No known bugs
 */

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <vector>

#include "cpu_backend.h"
#include "fp16.h"
#include "int4_utils.h"
#include "nntrainer_error.h"
#include "q4_0_utils.h"
#include "util_func.h"
namespace nntrainer {

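// Note: the block layout described in the comments below is inferred from how
// the structures are used in this file. A block_q4_0 stores one fp16 scale
// `d` and 16 bytes `qs` packing QK4_0 (32) 4-bit values, two per byte. The
// interleaved block_q4_0x4 / block_q4_0x8 variants group 4 or 8 such row
// blocks: their scales first, followed by the qs data in 8-byte chunks.

/**
 * @brief Unpack one interleaved block_q4_0x4 into four plain block_q4_0
 *        blocks, applying the per-nibble 0x8 XOR used by the interleaved
 *        packing.
 */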
void Q4_0Utils::unpackOneBlockQ4_0x4(const block_q4_0x4 *in, block_q4_0 *dst) {
  unsigned int blck_size_interleave = 8;

  for (int i = 0; i < 4; i++) {
    dst[i].d = in->d[i];
  }

  const int end = QK4_0 * 2 / blck_size_interleave;
  const uint64_t xor_mask = 0x8888888888888888ULL;

  for (int i = 0; i < end; ++i) {
    int dst_id = i % 4;
    int dst_offset = (i / 4) * blck_size_interleave;
    int src_offset = i * blck_size_interleave;

    uint64_t elems;
    memcpy(&elems, &in->qs[src_offset], sizeof(uint64_t));
    elems ^= xor_mask;
    memcpy(&dst[dst_id].qs[dst_offset], &elems, sizeof(uint64_t));
  }
}

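/**
 * @brief Unpack a row-interleaved Q4_0x4 weight buffer (nrow rows, K columns)
 *        into plain row-major block_q4_0 blocks, processing four rows at a
 *        time.
 */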
void Q4_0Utils::unpackBlocksQ4_0x4(const block_q4_0x4 *__restrict src,
                                   size_t data_size, size_t nrow, size_t K,
                                   block_q4_0 *__restrict dst) {
  int interleave_block = 4;

  const block_q4_0x4 *src_ = src;
  block_q4_0 *dst_ = (block_q4_0 *)dst;
  block_q4_0 dst_tmp[4];
  int nblocks = K / QK4_0;

  assert(data_size == (nrow / 4) * nblocks * sizeof(block_q4_0x4));

  for (size_t b = 0; b < nrow; b += interleave_block) {
    for (int64_t x = 0; x < nblocks; x++) {
      unpackOneBlockQ4_0x4(src_++, dst_tmp);

      for (size_t i = 0; i < interleave_block; i++) {
        dst_[x + i * nblocks] = dst_tmp[i];
      }
    }
    dst_ += interleave_block * nblocks;
  }
}

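/**
 * @brief Dequantize an N x K weight matrix stored in the repacked Q4_0x4
 *        format to float, by first unpacking it into plain Q4_0 blocks and
 *        then calling dequantize_row_q4_0.
 */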
void Q4_0Utils::dequantizeQ4_0x4(const void *q4_weight_repacked, int N, int K,
                                 float *dequantized_weights) {
  assert(K % QK4_0 == 0);
  assert(N % 4 == 0);
  size_t data_size = (K / QK4_0) * (N / 4) * sizeof(block_q4_0x4);
  std::vector<uint8_t> q4_weight_out(data_size);
  unpackBlocksQ4_0x4((block_q4_0x4 *)q4_weight_repacked, data_size, N, K,
                     (block_q4_0 *)q4_weight_out.data());

  nntrainer::dequantize_row_q4_0((const void *)q4_weight_out.data(),
                                 dequantized_weights, K * N);
}

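/**
 * @brief Unpack one interleaved block_q4_0x8 into eight plain block_q4_0
 *        blocks; the eight-row counterpart of unpackOneBlockQ4_0x4.
 */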
void Q4_0Utils::unpackOneBlockQ4_0x8(const block_q4_0x8 *in, block_q4_0 *dst) {
  unsigned int blck_size_interleave = 8;

  for (int i = 0; i < 8; i++) {
    dst[i].d = in->d[i];
  }

  const int end = QK4_0 * 4 / blck_size_interleave;
  const uint64_t xor_mask = 0x8888888888888888ULL;

  for (int i = 0; i < end; ++i) {
    int dst_id = i % 8;
    int dst_offset = (i / 8) * blck_size_interleave;
    int src_offset = i * blck_size_interleave;

    uint64_t elems;
    memcpy(&elems, &in->qs[src_offset], sizeof(uint64_t));
    elems ^= xor_mask;
    memcpy(&dst[dst_id].qs[dst_offset], &elems, sizeof(uint64_t));
  }
}

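/**
 * @brief Unpack a row-interleaved Q4_0x8 weight buffer (nrow rows, K columns)
 *        into plain row-major block_q4_0 blocks, processing eight rows at a
 *        time.
 */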
void Q4_0Utils::unpackBlocksQ4_0x8(const block_q4_0x8 *__restrict src,
                                   size_t data_size, size_t nrow, size_t K,
                                   block_q4_0 *__restrict dst) {
  int interleave_block = 8;

  const block_q4_0x8 *src_ = src;
  block_q4_0 *dst_ = (block_q4_0 *)dst;
  block_q4_0 dst_tmp[8];
  int nblocks = K / QK4_0;

  assert(data_size == (nrow / 8) * nblocks * sizeof(block_q4_0x8));

  for (size_t b = 0; b < nrow; b += interleave_block) {
    for (int64_t x = 0; x < nblocks; x++) {
      unpackOneBlockQ4_0x8(src_++, dst_tmp);

      for (size_t i = 0; i < interleave_block; i++) {
        dst_[x + i * nblocks] = dst_tmp[i];
      }
    }
    dst_ += interleave_block * nblocks;
  }
}

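/**
 * @brief Dequantize an N x K weight matrix stored in the repacked Q4_0x8
 *        format to float, by first unpacking it into plain Q4_0 blocks and
 *        then calling dequantize_row_q4_0.
 */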
void Q4_0Utils::dequantizeQ4_0x8(const void *q4_weight_repacked, int N, int K,
                                 float *dequantized_weights) {
  assert(K % QK4_0 == 0);
  assert(N % 8 == 0);
  size_t data_size = (K / QK4_0) * (N / 8) * sizeof(block_q4_0x8);
  std::vector<uint8_t> q4_weight_out(data_size);
  unpackBlocksQ4_0x8((block_q4_0x8 *)q4_weight_repacked, data_size, N, K,
                     (block_q4_0 *)q4_weight_out.data());

  nntrainer::dequantize_row_q4_0((const void *)q4_weight_out.data(),
                                 dequantized_weights, K * N);
}

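/**
 * @brief The two helpers below interleave 4 (resp. 8) consecutive block_q4_0
 *        blocks into one block_q4_0x4 (resp. block_q4_0x8): all scales first,
 *        then each block's low 8 qs bytes, then each block's high 8 qs bytes.
 */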
inline static void nntr_make_block_q4_0x4(const block_q4_0 *in,
                                          block_q4_0x4 *out) {
  constexpr size_t IN_CNT = 4;
  constexpr size_t HALF_SIZE = 8;

  for (int i = 0; i < IN_CNT; ++i) {
    out->d[i] = in[i].d;
  }

  for (int i = 0; i < IN_CNT; ++i) {
    memcpy(&out->qs[i * HALF_SIZE], &in[i].qs[0], HALF_SIZE);
  }
  for (int i = 0; i < IN_CNT; ++i) {
    memcpy(&out->qs[IN_CNT * HALF_SIZE + i * HALF_SIZE], &in[i].qs[8],
           HALF_SIZE);
  }
}

inline static void nntr_make_block_q4_0x8(const block_q4_0 *in,
                                          block_q4_0x8 *out) {
  constexpr size_t IN_CNT = 8;
  constexpr size_t HALF_SIZE = 8;

  for (int i = 0; i < IN_CNT; ++i) {
    out->d[i] = in[i].d;
  }

  for (int i = 0; i < IN_CNT; ++i) {
    memcpy(&out->qs[i * HALF_SIZE], &in[i].qs[0], HALF_SIZE);
  }
  for (int i = 0; i < IN_CNT; ++i) {
    memcpy(&out->qs[IN_CNT * HALF_SIZE + i * HALF_SIZE], &in[i].qs[8],
           HALF_SIZE);
  }
}

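/**
 * @brief Convert int4 weights laid out in 32-row (OSV32-style) blocks, with
 *        fp16 scales grouped every scale_group_size columns, into interleaved
 *        Q4_0x4 or Q4_0x8 blocks. The exact source addressing follows the
 *        index arithmetic below.
 */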
void Q4_0Utils::transformQ4_0x_FromInt4(size_t N, size_t K,
                                        const uint8_t *osv32_weights,
                                        const uint16_t *osv32_scales,
                                        size_t scale_group_size,
                                        int q4_0x_block_size, void *dst_q4_0x) {

  NNTR_THROW_IF((!(scale_group_size == 32 || scale_group_size == 64 ||
                   scale_group_size == 128)),
                std::invalid_argument)
    << "Scale group size must be 32/64/128";
  NNTR_THROW_IF(K % QK4_0 != 0, std::invalid_argument)
    << "K size must be divisible by QK4_0 (32)";
  NNTR_THROW_IF(N % 8 != 0, std::invalid_argument)
    << "N size must be divisible by 8";
  NNTR_THROW_IF((!(q4_0x_block_size == 4 || q4_0x_block_size == 8)),
                std::invalid_argument)
    << "q4_0x_block_size must be 4 or 8";

  static constexpr const size_t ROW_BLOCK_SIZE = 32;
  static constexpr const size_t COLUMN_BLOCK_SIZE = 2;

  uint8_t int4_weight[16];
  uint16_t scale;
  block_q4_0 dst_tmp[8];
  uint8_t *dst_ = reinterpret_cast<uint8_t *>(dst_q4_0x);

  // --- Layout ---
  const size_t rows_count_pad = align(N, ROW_BLOCK_SIZE);
  const size_t columns_count_pad = align(K, ROW_BLOCK_SIZE);
  const size_t column_blocks_count =
    columns_count_pad / COLUMN_BLOCK_SIZE; // COLUMN_BLOCK_SIZE == 2
  const size_t bytes_per_row_block_span = column_blocks_count * ROW_BLOCK_SIZE;

  for (size_t row_id = 0; row_id < N; row_id += q4_0x_block_size) {
    const size_t row_block_id = row_id / ROW_BLOCK_SIZE;
    size_t i_in_block = row_id % ROW_BLOCK_SIZE;
    for (int64_t column_idx = 0; column_idx < K; column_idx += QK4_0) {
      for (size_t i = 0; i < q4_0x_block_size; i++) {
        int row_idx = row_id + i;
        // Address the bytes for this row
        const size_t row_block_base =
          row_block_id * bytes_per_row_block_span + i_in_block + i;
        int index0 = row_block_base + (column_idx / 2) * ROW_BLOCK_SIZE;

        for (size_t column_block_id = 0; column_block_id < 16;
             ++column_block_id) {
          int4_weight[column_block_id] =
            osv32_weights[index0 + column_block_id * ROW_BLOCK_SIZE];
        }
        scale = osv32_scales[row_idx +
                             (column_idx / scale_group_size) * rows_count_pad];

        create_q4_0_weights(int4_weight, dst_tmp[i].qs);
        dst_tmp[i].d = scale;
      }
      // Repack Q4_0 data
      if (q4_0x_block_size == 4) {
        nntr_make_block_q4_0x4(dst_tmp, (block_q4_0x4 *)dst_);
      } else {
        nntr_make_block_q4_0x8(dst_tmp, (block_q4_0x8 *)dst_);
      }
      dst_ += q4_0x_block_size * sizeof(block_q4_0);
    }
  }
}

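/**
 * @brief Debug helper: print the 32 quantized nibbles (low nibble, then high
 *        nibble, per byte) and the fp16 scale of a single block_q4_0.
 */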
void Q4_0Utils::printBlockQ4_0(const block_q4_0 *block) {
  printf("Q4_0: ");
  for (int i = 0; i < 16; i++) {
    printf("%i %i ", block->qs[i] & 0x0F, (block->qs[i] >> 4) & 0x0F);
  }
  printf("| scale:%f\n", compute_fp16_to_fp32(block->d));
}

} // namespace nntrainer