LCOV - code coverage report
Current view: top level - nntrainer/tensor - q4_0_utils.cpp (source / functions) Coverage Total Hit
Test: coverage_filtered.info Lines: 51.8 % 110 57
Test Date: 2025-12-14 20:38:17 Functions: 40.0 % 10 4

            Line data    Source code
       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * @file        q4_0_utils.cpp
       4              :  * @date        15 October 2025
       5              :  * @brief       This is Q4_0Utils class for utils for Q4_0 quantization format.
       6              :  * @see         https://github.com/nnstreamer/nntrainer
       7              :  * @author      Maciej Nalewaj <m.nalewaj@samsung.com>
       8              :  * @bug         No known bugs
       9              :  */
      10              : 
      11              : #include <cassert>
      12              : #include <cmath>
      13              : 
      14              : #include "cpu_backend.h"
      15              : #include "fp16.h"
      16              : #include "int4_utils.h"
      17              : #include "nntrainer_error.h"
      18              : #include "q4_0_utils.h"
      19              : #include "util_func.h"
      20              : 
      21              : #ifdef __AVX2__
      22              : #include <immintrin.h>
      23              : #endif
      24              : 
      25              : namespace nntrainer {
      26              : 
      27            0 : void Q4_0Utils::unpackOneBlockQ4_0x4(const block_q4_0x4 *in, block_q4_0 *dst) {
      28              :   unsigned int blck_size_interleave = 8;
      29              : 
      30            0 :   for (int i = 0; i < 4; i++) {
      31            0 :     dst[i].d = in->d[i];
      32              :   }
      33              : 
      34              :   const int end = QK4_0 * 2 / blck_size_interleave;
      35              :   const uint64_t xor_mask = 0x8888888888888888ULL;
      36              : 
      37            0 :   for (int i = 0; i < end; ++i) {
      38            0 :     int dst_id = i % 4;
      39            0 :     int dst_offset = (i / 4) * blck_size_interleave;
      40            0 :     int src_offset = i * blck_size_interleave;
      41              : 
      42              :     uint64_t elems;
      43            0 :     memcpy(&elems, &in->qs[src_offset], sizeof(uint64_t));
      44            0 :     elems ^= xor_mask;
      45            0 :     memcpy(&dst[dst_id].qs[dst_offset], &elems, sizeof(uint64_t));
      46              :   }
      47            0 : }
      48              : 
      49            0 : void Q4_0Utils::unpackBlocksQ4_0x4(const block_q4_0x4 *__restrict src,
      50              :                                    size_t data_size, size_t nrow, size_t K,
      51              :                                    block_q4_0 *__restrict dst) {
      52              :   int interleave_block = 4;
      53              : 
      54              :   const block_q4_0x4 *src_ = src;
      55              :   block_q4_0 *dst_ = (block_q4_0 *)dst;
      56              :   block_q4_0 dst_tmp[4];
      57            0 :   int nblocks = K / QK4_0;
      58              : 
      59            0 :   assert(data_size == (nrow / 4) * nblocks * sizeof(block_q4_0x4));
      60              : 
      61            0 :   for (size_t b = 0; b < nrow; b += interleave_block) {
      62            0 :     for (int64_t x = 0; x < nblocks; x++) {
      63            0 :       unpackOneBlockQ4_0x4(src_++, dst_tmp);
      64              : 
      65            0 :       for (size_t i = 0; i < interleave_block; i++) {
      66            0 :         dst_[x + i * nblocks] = dst_tmp[i];
      67              :       }
      68              :     }
      69            0 :     dst_ += interleave_block * nblocks;
      70              :   }
      71            0 : }
      72              : 
      73            0 : void Q4_0Utils::dequantizeQ4_0x4(const void *q4_weight_repacked, int N, int K,
      74              :                                  float *dequantized_weights) {
      75            0 :   assert(K % QK4_0 == 0);
      76            0 :   assert(N % 4 == 0);
      77            0 :   size_t data_size = (K / QK4_0) * (N / 4) * sizeof(block_q4_0x4);
      78            0 :   std::vector<uint8_t> q4_weight_out(data_size);
      79            0 :   unpackBlocksQ4_0x4((block_q4_0x4 *)q4_weight_repacked, data_size, N, K,
      80              :                      (block_q4_0 *)q4_weight_out.data());
      81              : 
      82            0 :   nntrainer::dequantize_row_q4_0((const void *)q4_weight_out.data(),
      83            0 :                                  dequantized_weights, K * N);
      84            0 : }
      85              : 
      86       438164 : void Q4_0Utils::unpackOneBlockQ4_0x8(const block_q4_0x8 *in, block_q4_0 *dst) {
      87              :   unsigned int blck_size_interleave = 8;
      88              : 
      89      3943476 :   for (int i = 0; i < 8; i++) {
      90      3505312 :     dst[i].d = in->d[i];
      91              :   }
      92              : 
      93              :   const int end = QK4_0 * 4 / blck_size_interleave;
      94              :   const uint64_t xor_mask = 0x8888888888888888ULL;
      95              : 
      96      7448788 :   for (int i = 0; i < end; ++i) {
      97      7010624 :     int dst_id = i % 8;
      98      7010624 :     int dst_offset = (i / 8) * blck_size_interleave;
      99      7010624 :     int src_offset = i * blck_size_interleave;
     100              : 
     101              :     uint64_t elems;
     102      7010624 :     memcpy(&elems, &in->qs[src_offset], sizeof(uint64_t));
     103      7010624 :     elems ^= xor_mask;
     104      7010624 :     memcpy(&dst[dst_id].qs[dst_offset], &elems, sizeof(uint64_t));
     105              :   }
     106       438164 : }
     107              : 
     108           48 : void Q4_0Utils::unpackBlocksQ4_0x8(const block_q4_0x8 *__restrict src,
     109              :                                    size_t data_size, size_t nrow, size_t K,
     110              :                                    block_q4_0 *__restrict dst) {
     111              :   int interleave_block = 8;
     112              : 
     113              :   const block_q4_0x8 *src_ = src;
     114              :   block_q4_0 *dst_ = (block_q4_0 *)dst;
     115              :   block_q4_0 dst_tmp[8];
     116           48 :   int nblocks = K / QK4_0;
     117              : 
     118           48 :   assert(data_size == (nrow / 8) * nblocks * sizeof(block_q4_0x8));
     119              : 
     120         5880 :   for (size_t b = 0; b < nrow; b += interleave_block) {
     121       443996 :     for (int64_t x = 0; x < nblocks; x++) {
     122       438164 :       unpackOneBlockQ4_0x8(src_++, dst_tmp);
     123              : 
     124      3943476 :       for (size_t i = 0; i < interleave_block; i++) {
     125      3505312 :         dst_[x + i * nblocks] = dst_tmp[i];
     126              :       }
     127              :     }
     128         5832 :     dst_ += interleave_block * nblocks;
     129              :   }
     130           48 : }
     131              : 
     132            0 : void Q4_0Utils::dequantizeQ4_0x8(const void *q4_weight_repacked, int N, int K,
     133              :                                  float *dequantized_weights) {
     134            0 :   assert(K % QK4_0 == 0);
     135            0 :   assert(N % 8 == 0);
     136            0 :   size_t data_size = (K / QK4_0) * (N / 8) * sizeof(block_q4_0x8);
     137            0 :   std::vector<uint8_t> q4_weight_out(data_size);
     138            0 :   unpackBlocksQ4_0x8((block_q4_0x8 *)q4_weight_repacked, data_size, N, K,
     139              :                      (block_q4_0 *)q4_weight_out.data());
     140              : 
     141            0 :   nntrainer::dequantize_row_q4_0((const void *)q4_weight_out.data(),
     142            0 :                                  dequantized_weights, K * N);
     143            0 : }
     144              : 
     145            0 : inline static void nntr_make_block_q4_0x4(const block_q4_0 *in,
     146              :                                           block_q4_0x4 *out) {
     147              :   constexpr size_t IN_CNT = 4;
     148              :   constexpr size_t HALF_SIZE = 8;
     149              : 
     150            0 :   for (int i = 0; i < IN_CNT; ++i) {
     151            0 :     out->d[i] = in[i].d;
     152              :   }
     153              : 
     154            0 :   for (int i = 0; i < IN_CNT; ++i) {
     155            0 :     memcpy(&out->qs[i * HALF_SIZE], &in[i].qs[0], HALF_SIZE);
     156              :   }
     157            0 :   for (int i = 0; i < IN_CNT; ++i) {
     158            0 :     memcpy(&out->qs[IN_CNT * HALF_SIZE + i * HALF_SIZE], &in[i].qs[8],
     159              :            HALF_SIZE);
     160              :   }
     161            0 : }
     162              : 
     163       219082 : inline static void nntr_make_block_q4_0x8(const block_q4_0 *in,
     164              :                                           block_q4_0x8 *out) {
     165              :   constexpr size_t IN_CNT = 8;
     166              :   constexpr size_t HALF_SIZE = 8;
     167              : 
     168      1971738 :   for (int i = 0; i < IN_CNT; ++i) {
     169      1752656 :     out->d[i] = in[i].d;
     170              :   }
     171              : 
     172      1971738 :   for (int i = 0; i < IN_CNT; ++i) {
     173      1752656 :     memcpy(&out->qs[i * HALF_SIZE], &in[i].qs[0], HALF_SIZE);
     174              :   }
     175      1971738 :   for (int i = 0; i < IN_CNT; ++i) {
     176      1752656 :     memcpy(&out->qs[IN_CNT * HALF_SIZE + i * HALF_SIZE], &in[i].qs[8],
     177              :            HALF_SIZE);
     178              :   }
     179       219082 : }
     180              : 
     181           24 : void Q4_0Utils::transformQ4_0x_FromInt4(size_t N, size_t K,
     182              :                                         const uint8_t *osv32_weights,
     183              :                                         const uint16_t *osv32_scales,
     184              :                                         size_t scale_group_size,
     185              :                                         int q4_0x_block_size, void *dst_q4_0x) {
     186              : 
     187           24 :   NNTR_THROW_IF((!(scale_group_size == 32 || scale_group_size == 64 ||
     188              :                    scale_group_size == 128)),
     189              :                 std::invalid_argument)
     190              :     << "Scale group size must be 32/64/128";
     191           24 :   NNTR_THROW_IF(K % QK4_0 != 0, std::invalid_argument)
     192              :     << "K size must be divisable by QK4_0 (32)";
     193           24 :   NNTR_THROW_IF(N % 8 != 0, std::invalid_argument)
     194              :     << "N size must be divisable by 8";
     195           24 :   NNTR_THROW_IF((!(q4_0x_block_size == 4 || q4_0x_block_size == 8)),
     196              :                 std::invalid_argument)
     197              :     << "q4_0x_block_size must be 4 or 8";
     198              : 
     199              :   static constexpr const size_t ROW_BLOCK_SIZE = 32;
     200              :   static constexpr const size_t COLUMN_BLOCK_SIZE = 2;
     201              : 
     202              :   uint8_t int4_weight[16];
     203              :   uint16_t scale;
     204              :   block_q4_0 dst_tmp[8];
     205              :   uint8_t *dst_ = reinterpret_cast<uint8_t *>(dst_q4_0x);
     206              : 
     207              :   // --- Layout ---
     208           24 :   const size_t rows_count_pad = align(N, ROW_BLOCK_SIZE);
     209           24 :   const size_t columns_count_pad = align(K, ROW_BLOCK_SIZE);
     210           24 :   const size_t column_blocks_count =
     211              :     columns_count_pad / COLUMN_BLOCK_SIZE; // COLUMN_BLOCK_SIZE == 2
     212              :   const size_t bytes_per_row_block_span = column_blocks_count * ROW_BLOCK_SIZE;
     213              : 
     214         2940 :   for (size_t row_id = 0; row_id < N; row_id += q4_0x_block_size) {
     215         2916 :     const size_t row_block_id = row_id / ROW_BLOCK_SIZE;
     216         2916 :     size_t i_in_block = row_id % ROW_BLOCK_SIZE;
     217       221998 :     for (int64_t column_idx = 0; column_idx < K; column_idx += QK4_0) {
     218      1971738 :       for (size_t i = 0; i < q4_0x_block_size; i++) {
     219      1752656 :         int row_idx = row_id + i;
     220              :         // Address the bytes for this row
     221      1752656 :         const size_t row_block_base =
     222      1752656 :           row_block_id * bytes_per_row_block_span + i_in_block + i;
     223      1752656 :         int index0 = row_block_base + (column_idx / 2) * ROW_BLOCK_SIZE;
     224              : 
     225     29795152 :         for (size_t column_block_id = 0; column_block_id < 16;
     226              :              ++column_block_id) {
     227     28042496 :           int4_weight[column_block_id] =
     228     28042496 :             osv32_weights[index0 + column_block_id * ROW_BLOCK_SIZE];
     229              :         }
     230      1752656 :         scale = osv32_scales[row_idx +
     231      1752656 :                              (column_idx / scale_group_size) * rows_count_pad];
     232              : 
     233      1752656 :         create_q4_0_weights(int4_weight, dst_tmp[i].qs);
     234      1752656 :         dst_tmp[i].d = scale;
     235              :       }
     236              :       // Repack Q4_0 data
     237       219082 :       if (q4_0x_block_size == 4) {
     238            0 :         nntr_make_block_q4_0x4(dst_tmp, (block_q4_0x4 *)dst_);
     239              :       } else {
     240       219082 :         nntr_make_block_q4_0x8(dst_tmp, (block_q4_0x8 *)dst_);
     241              :       }
     242       219082 :       dst_ += q4_0x_block_size * sizeof(block_q4_0);
     243              :     }
     244              :   }
     245           24 : }
     246              : 
     247            0 : void Q4_0Utils::printBlockQ4_0(const block_q4_0 *block) {
     248              :   printf("Q4_0: ");
     249            0 :   for (int i = 0; i < 16; i++) {
     250            0 :     printf("%i %i ", block->qs[i] & 0x0F, (block->qs[i] >> 4) & 0x0F);
     251              :   }
     252            0 :   printf("| scale:%f\n", compute_fp16_to_fp32(block->d));
     253            0 : }
     254              : 
     255              : } // namespace nntrainer
        

Generated by: LCOV version 2.0-1