Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * @file char_tensor.cpp
4 : * @date 02 April 2024
5 : * @brief This is CharTensor class for 8-bit integer calculation
6 : * @see https://github.com/nnstreamer/nntrainer
7 : * @author Donghyeon Jeong <dhyeon.jeong@samsung.com>
8 : * @bug No known bugs except for NYI items
9 : */
10 :
11 : #include <iomanip>
12 : #include <iostream>
13 :
14 : #include <char_tensor.h>
15 : #include <cpu_backend.h>
16 : #include <tensor.h>
17 :
18 : namespace nntrainer {
19 :
20 2 : CharTensor::CharTensor(std::string name_, Tformat fm, QScheme qscheme_) :
21 4 : TensorBase(name_, fm, Tdatatype::QINT8), qscheme(qscheme_) {}
22 :
23 58 : CharTensor::CharTensor(const TensorDim &d, bool alloc_now, Initializer init,
24 58 : std::string name, QScheme qscheme_) :
25 58 : TensorBase(d, alloc_now, init, name), qscheme(qscheme_) {
26 58 : if (alloc_now)
27 54 : allocate();
28 58 : }
29 :
30 32 : CharTensor::CharTensor(const TensorDim &d, const void *buf, QScheme qscheme_) :
31 32 : CharTensor(d, true, Initializer::NONE, "", qscheme_) {
32 32 : if (d.getDataLen() != 0) {
33 32 : if (buf != nullptr)
34 8 : copy(buf);
35 : }
36 32 : }
37 :
38 2 : CharTensor::CharTensor(
39 : std::vector<std::vector<std::vector<std::vector<int8_t>>>> const &d,
40 4 : std::vector<float> const &scales, Tformat fm, QScheme qscheme_) {
41 2 : if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
42 : throw std::out_of_range(
43 1 : "[Tensor] trying to initialize CharTensor from empty vector");
44 : }
45 :
46 1 : dim.setTensorDim(0, d.size());
47 1 : if (fm == Tformat::NCHW) {
48 1 : dim.setTensorDim(1, d[0].size());
49 1 : dim.setTensorDim(2, d[0][0].size());
50 1 : dim.setTensorDim(3, d[0][0][0].size());
51 : } else {
52 0 : dim.setTensorDim(2, d[0].size());
53 0 : dim.setTensorDim(3, d[0][0].size());
54 0 : dim.setTensorDim(1, d[0][0][0].size());
55 : }
56 :
57 : dim.setTensorType({fm, Tdatatype::QINT8});
58 :
59 1 : strides = dim.computeStrides();
60 1 : contiguous = true;
61 1 : initializer = Initializer::NONE;
62 1 : qscheme = qscheme_;
63 :
64 1 : NNTR_THROW_IF(scales.size() != scale_size(), std::invalid_argument)
65 : << "invalid scale factor size " << scales.size();
66 :
67 : MemoryData *mem_data = new MemoryData(
68 131 : (void *)(new int8_t[dim.getDataLen() + sizeof(float) * scale_size()]()));
69 2 : data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
70 1 : delete[] mem_data->getAddr<int8_t>();
71 1 : delete mem_data;
72 : });
73 :
74 1 : offset = 0;
75 :
76 : // if fm == Tformat::NCHW, then dim[0] == batch , dim[1] == channel, dim[2]
77 : // == height, dim[3] == width. and if fm == Tformat::NHWC, dim[0] == batch,
78 : // dim[1] == height, dim[2] == width, dim[3] == channel
79 1 : if (fm == Tformat::NCHW) {
80 2 : for (unsigned int i = 0; i < batch(); ++i)
81 4 : for (unsigned int j = 0; j < channel(); ++j)
82 12 : for (unsigned int k = 0; k < height(); ++k)
83 99 : for (unsigned int l = 0; l < width(); ++l)
84 90 : this->setValue(i, j, k, l, d[i][j][k][l]);
85 : } else {
86 0 : for (unsigned int i = 0; i < batch(); ++i)
87 0 : for (unsigned int j = 0; j < height(); ++j)
88 0 : for (unsigned int k = 0; k < width(); ++k)
89 0 : for (unsigned int l = 0; l < channel(); ++l)
90 0 : this->setValue(i, l, j, k, d[i][j][k][l]);
91 : }
92 :
93 : // copy scale factors
94 1 : scopy(scale_size(), scales.data(), 1, (float *)getScale(), 1);
95 2 : }
96 :
97 10 : bool CharTensor::operator==(const CharTensor &rhs) const {
98 10 : if (qscheme != rhs.qscheme)
99 : return false;
100 :
101 : // compare quantized data
102 10 : const int8_t *_data = (int8_t *)getData();
103 10 : const int8_t *_rdata = (int8_t *)rhs.getData();
104 741 : for (size_t i = 0; i < size(); ++i) {
105 734 : if (_data[i] != _rdata[i])
106 : return false;
107 : }
108 :
109 : // compare scale factors
110 7 : const float *_scales = (float *)getScale();
111 7 : const float *_rscales = (float *)rhs.getScale();
112 14 : for (size_t i = 0; i < scale_size(); ++i) {
113 7 : if (std::fabs(_scales[i] - _rscales[i]) > 1e-5)
114 : return false;
115 : }
116 :
117 : return true;
118 : }
119 :
120 57 : void CharTensor::allocate() {
121 57 : if (empty() || data)
122 : return;
123 :
124 56 : if (src_tensor) {
125 : /// allocate data based on the source tensor
126 2 : allocateSrcTensor();
127 : /** as this memory is shared, do NOT initialize */
128 : } else {
129 : /// allocate new memory for the tensor data
130 : MemoryData *mem_data;
131 :
132 : mem_data = new MemoryData(
133 607844 : (void *)(new int8_t[dim.getDataLen() + 4 * scale_size()]{}));
134 54 : data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
135 54 : delete[] mem_data->template getAddr<int8_t>();
136 54 : delete mem_data;
137 : });
138 :
139 54 : offset = 0;
140 54 : initialize();
141 : }
142 : }
143 :
144 1 : void CharTensor::deallocate() {
145 : data = nullptr;
146 1 : offset = 0;
147 1 : }
148 :
149 6597 : void *CharTensor::getData() const {
150 6597 : if (!data)
151 : return nullptr;
152 :
153 : data->validate();
154 6597 : return data->getAddr<int8_t>() + offset;
155 : }
156 :
157 1 : void *CharTensor::getData(size_t idx) const {
158 1 : if (!data)
159 : return nullptr;
160 :
161 : data->validate();
162 1 : return data->getAddr<int8_t>() + offset + idx;
163 : }
164 :
165 40 : void *CharTensor::getScale() const {
166 40 : if (!data)
167 : return nullptr;
168 :
169 : data->validate();
170 40 : return ((int8_t *)getData()) + size();
171 : }
172 :
173 0 : void *CharTensor::getScale(size_t idx) const {
174 0 : NNTR_THROW_IF(idx > scale_size(), std::invalid_argument)
175 : << "Tensor::getScale() index is not valid";
176 :
177 0 : if (!data)
178 : return nullptr;
179 :
180 : data->validate();
181 0 : return ((float *)getScale()) + idx;
182 : }
183 :
184 0 : void *CharTensor::getAddress(unsigned int i) {
185 0 : size_t index = getIndex(batch(), channel(), height(), width());
186 0 : if (i > index) {
187 : return nullptr;
188 : }
189 0 : return &((int8_t *)getData())[i];
190 : }
191 :
192 0 : const void *CharTensor::getAddress(unsigned int i) const {
193 0 : size_t index = getIndex(batch(), channel(), height(), width());
194 0 : if (i > index) {
195 : return nullptr;
196 : }
197 0 : return &((int8_t *)getData())[i];
198 : }
199 :
200 44 : const int8_t &CharTensor::getValue(unsigned int i) const {
201 44 : return ((int8_t *)getData())[i];
202 : }
203 :
204 29 : int8_t &CharTensor::getValue(unsigned int i) {
205 29 : return ((int8_t *)getData())[i];
206 : }
207 :
208 44 : const int8_t &CharTensor::getValue(unsigned int b, unsigned int c,
209 : unsigned int h, unsigned int w) const {
210 44 : return getValue(getIndex(b, c, h, w));
211 : }
212 :
213 5 : int8_t &CharTensor::getValue(unsigned int b, unsigned int c, unsigned int h,
214 : unsigned int w) {
215 5 : return getValue(getIndex(b, c, h, w));
216 : }
217 :
218 13 : void CharTensor::setValue(float value) {
219 13 : int8_t *data = (int8_t *)getData();
220 13 : std::fill(data, data + size(), static_cast<int8_t>(value));
221 13 : }
222 :
223 6 : void CharTensor::addValue(unsigned int b, unsigned int c, unsigned int h,
224 : unsigned int w, float value, float beta) {
225 6 : auto const &idx = getIndex(b, c, h, w);
226 6 : float output = ((int8_t *)getData())[idx];
227 6 : output *= beta;
228 6 : output += value;
229 :
230 6 : ((int8_t *)getData())[idx] = static_cast<int8_t>(std::trunc(output));
231 6 : }
232 :
233 6124 : void CharTensor::setValue(unsigned int b, unsigned int c, unsigned int h,
234 : unsigned int w, float value) {
235 6124 : ((int8_t *)getData())[getIndex(b, c, h, w)] = static_cast<int8_t>(value);
236 6124 : }
237 :
238 6 : void CharTensor::setZero() {
239 : /// @todo replace with apply_i or scal
240 6 : setValue(0);
241 6 : }
242 :
243 63 : void CharTensor::initialize() {
244 63 : if (empty() || !isAllocated())
245 : return;
246 :
247 : /// @note Sampling from the normal/uniform distribution is invalid
248 63 : switch (initializer) {
249 5 : case Initializer::ZEROS:
250 5 : setZero();
251 5 : break;
252 4 : case Initializer::ONES:
253 4 : setValue(1.0f);
254 4 : break;
255 : case Initializer::NONE:
256 : break;
257 1 : default:
258 1 : throw std::invalid_argument("Initializer not valid for " +
259 3 : getStringDataType());
260 : break;
261 : }
262 :
263 62 : putData();
264 : }
265 :
266 2 : void CharTensor::initialize(Initializer init) {
267 2 : initializer = init;
268 2 : initialize();
269 1 : }
270 :
271 0 : int CharTensor::multiply_i(float const &value) {
272 : // multiply value to scale factors
273 0 : float *g_scale = (float *)getScale();
274 :
275 0 : sscal(scale_size(), value, g_scale, 1);
276 0 : return ML_ERROR_NONE;
277 : }
278 :
279 1 : Tensor &CharTensor::multiply(Tensor const &input, Tensor &output,
280 : const float scale) const {
281 1 : CREATE_IF_EMPTY_DIMS(output, dim, nullptr, q_scheme());
282 :
283 1 : NNTR_THROW_IF(q_scheme() != input.q_scheme(), std::invalid_argument)
284 : << "[Tensor] Cannot multiply tensors with different quantization schemes.";
285 :
286 : /// @note remove after vector scale multiply is implemented
287 1 : NNTR_THROW_IF(q_scheme() != QScheme::PER_TENSOR_AFFINE, std::invalid_argument)
288 : << "Multiplication other than per tensor affine quantization scheme is "
289 : "NYI.";
290 :
291 1 : float lhs_scale = *(float *)getScale();
292 1 : float rhs_scale = *input.getScale<float>();
293 :
294 : /// @note current impl assumes pre-established quantization parameters are set
295 : /// @todo 1. verify result_scale is valid 2. calculate qparams if not given
296 0 : NNTR_THROW_IF(std::fpclassify(lhs_scale) == FP_ZERO ||
297 : std::fpclassify(rhs_scale) == FP_ZERO ||
298 : std::fpclassify(scale) == FP_ZERO,
299 : std::invalid_argument)
300 : << "scale factors not set, cannot multiply";
301 :
302 1 : float multiplier = lhs_scale * rhs_scale / scale;
303 :
304 1 : int8_t *lhs = (int8_t *)getData();
305 : int8_t *rhs = input.getData<int8_t>();
306 : int8_t *result = output.getData<int8_t>();
307 :
308 17 : for (unsigned int i = 0; i < size(); ++i) {
309 16 : int32_t accum_val =
310 16 : static_cast<int32_t>(lhs[i]) * static_cast<int32_t>(rhs[i]);
311 :
312 16 : result[i] =
313 32 : std::max(-128, std::min((int)std::lround(multiplier * accum_val), 127));
314 : }
315 :
316 1 : *output.getScale<float>() = scale;
317 :
318 1 : return output;
319 : }
320 :
321 1 : Tensor &CharTensor::add(Tensor const &input, Tensor &output,
322 : float const scale) const {
323 1 : CREATE_IF_EMPTY_DIMS(output, dim, nullptr, qscheme);
324 :
325 1 : NNTR_THROW_IF(q_scheme() != input.q_scheme(), std::invalid_argument)
326 : << "[Tensor] Cannot multiply tensors with different quantization schemes.";
327 :
328 : /// @note remove after vector scale multiply is implemented
329 1 : NNTR_THROW_IF(q_scheme() != QScheme::PER_TENSOR_AFFINE, std::invalid_argument)
330 : << "Tensor addition other than per tensor affine quantization scheme is "
331 : "NYI.";
332 :
333 1 : float lhs_scale = *(float *)getScale();
334 1 : float rhs_scale = *input.getScale<float>();
335 :
336 : /// @note current impl assumes pre-established quantization parameters are set
337 : /// @todo 1. verify result_scale is valid 2. calculate qparams if not given
338 : /// 3. check qscheme is per tensor affine
339 0 : NNTR_THROW_IF(std::fpclassify(lhs_scale) == FP_ZERO ||
340 : std::fpclassify(rhs_scale) == FP_ZERO ||
341 : std::fpclassify(scale) == FP_ZERO,
342 : std::invalid_argument)
343 : << "scale factors not set, cannot multiply";
344 :
345 : /// @todo check whether the following method has faster execution speed.
346 : /// 1. clone input A and B to A_fp32 and B_fp32
347 : /// 2. dequantize A_fp32 and B_fp32
348 : /// 3. perform addition: A_fp32.add(B_fp32, output_fp32)
349 : /// 4. quantize output_fp32
350 2 : for (unsigned int b = 0; b < batch(); ++b) {
351 2 : for (unsigned int c = 0; c < channel(); ++c) {
352 5 : for (unsigned int h = 0; h < height(); ++h) {
353 20 : for (unsigned int w = 0; w < width(); ++w) {
354 16 : float val = getValue(b, c, h, w) * lhs_scale +
355 16 : input.getValue<int8_t>(b, c, h, w) * rhs_scale;
356 :
357 16 : output.setValue(
358 : b, c, h, w,
359 : static_cast<int8_t>(
360 32 : std::max(-128, std::min((int)std::lround(val / scale), 127))));
361 : }
362 : }
363 : }
364 : }
365 1 : *output.getScale<float>() = scale;
366 :
367 1 : return output;
368 : }
369 :
370 1 : void CharTensor::copy(const Tensor &from) {
371 1 : reshape(from.getDim());
372 1 : copy(from.getData());
373 1 : }
374 :
375 2 : void CharTensor::copyData(const Tensor &from) {
376 2 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
377 : << getName() << " is not contiguous, cannot copy.";
378 :
379 2 : NNTR_THROW_IF(size() != from.size(), std::invalid_argument)
380 : << "Size of tensor to copy must match";
381 :
382 : /// @todo support copy from float32 & float16 to int8 data
383 : /// @note this could require scale factor
384 2 : switch (from.getDataType()) {
385 : case ml::train::TensorDim::DataType::QINT8:
386 0 : copy(from.getData());
387 0 : break;
388 2 : case ml::train::TensorDim::DataType::FP32:
389 4 : copy_fp32(from.size(), from.getData<float>(), (int8_t *)getData());
390 2 : break;
391 0 : default:
392 0 : throw std::invalid_argument("Error: Unsupported data type");
393 : break;
394 : }
395 2 : }
396 :
397 1 : void CharTensor::copy_with_stride(const Tensor &input, Tensor &output) {
398 4 : for (unsigned int b = 0; b < output.batch(); ++b) {
399 6 : for (unsigned int c = 0; c < output.channel(); ++c) {
400 12 : for (unsigned int h = 0; h < output.height(); ++h) {
401 54 : for (unsigned int w = 0; w < output.width(); ++w) {
402 45 : output.setValue(b, c, h, w, input.getValue<int8_t>(b, c, h, w));
403 : }
404 : }
405 : }
406 : }
407 1 : }
408 :
409 0 : void CharTensor::save(std::ostream &file) {
410 : /// @note Save quantization information
411 0 : save_quantization_info(file);
412 :
413 0 : std::streamsize sz = static_cast<std::streamsize>(getMemoryBytes());
414 :
415 0 : NNTR_THROW_IF(sz < 0, std::invalid_argument)
416 0 : << "save size: " << getMemoryBytes()
417 : << " is too big. It cannot be represented by std::streamsize";
418 :
419 0 : checkedWrite(file, (char *)getData(), sz,
420 : "[CharTensor::save] operation failed");
421 0 : putData();
422 0 : }
423 :
424 0 : void CharTensor::read(std::ifstream &file, size_t start_offset,
425 : bool read_from_offset) {
426 0 : if (start_offset == std::numeric_limits<size_t>::max()) {
427 0 : start_offset = file_offset;
428 : }
429 0 : read_quantization_info(file, start_offset, read_from_offset);
430 :
431 0 : std::streamsize sz = static_cast<std::streamsize>(getMemoryBytes());
432 :
433 0 : NNTR_THROW_IF(sz < 0, std::invalid_argument)
434 0 : << "read size: " << getMemoryBytes()
435 : << " is too big. It cannot be represented by std::streamsize";
436 :
437 0 : if (read_from_offset) {
438 0 : start_offset += sizeof(uint16_t);
439 : }
440 :
441 0 : checkedRead(file, (char *)getData(), sz,
442 : "[CharTensor::read] operation failed", start_offset,
443 : read_from_offset);
444 0 : putData();
445 0 : }
446 :
447 2 : std::vector<unsigned int> CharTensor::argmax() const {
448 : std::vector<unsigned int> result;
449 2 : const int8_t *data = (int8_t *)getData();
450 : size_t batch_size = batch();
451 2 : size_t feature_len = dim.getFeatureLen();
452 :
453 2 : result.resize(batch_size);
454 :
455 7 : for (unsigned int b = 0; b < batch_size; b++) {
456 : auto max_iter =
457 5 : std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
458 5 : result[b] = std::distance(data, max_iter) - (b * feature_len);
459 : }
460 2 : return result;
461 0 : }
462 :
463 1 : std::vector<unsigned int> CharTensor::argmin() const {
464 : std::vector<unsigned int> result;
465 1 : const int8_t *data = (int8_t *)getData();
466 : size_t batch_size = batch();
467 1 : size_t feature_len = dim.getFeatureLen();
468 :
469 1 : result.resize(batch_size);
470 :
471 3 : for (unsigned int b = 0; b < batch_size; b++) {
472 : auto min_iter =
473 2 : std::min_element(data + b * feature_len, data + (b + 1) * feature_len);
474 2 : result[b] = std::distance(data, min_iter) - (b * feature_len);
475 : }
476 1 : return result;
477 0 : }
478 :
479 3 : float CharTensor::max_abs() const {
480 3 : const int8_t *data = (int8_t *)getData();
481 : unsigned int idx;
482 :
483 3 : int8_t max_val = data[0];
484 5278 : for (unsigned int i = 1; i < size(); i += 1) {
485 5275 : int8_t cur_val = (data[i] >= 0) ? data[i] : -1 * data[i];
486 5275 : if (cur_val > max_val) {
487 : max_val = cur_val;
488 : }
489 : }
490 :
491 3 : return max_val;
492 : }
493 :
494 1 : float CharTensor::maxValue() const {
495 1 : const int8_t *data = (int8_t *)getData();
496 1 : return *std::max_element(data, data + size());
497 : }
498 :
499 3 : float CharTensor::minValue() const {
500 3 : const int8_t *data = (int8_t *)getData();
501 3 : return *std::min_element(data, data + size());
502 : }
503 :
504 3 : void CharTensor::print(std::ostream &out) const {
505 3 : const int8_t *data = (int8_t *)getData();
506 3 : unsigned int len = size();
507 3 : out << "data addr: " << reinterpret_cast<const float *>(data) << '\n';
508 3 : out << dim;
509 :
510 3 : if (len > 100) {
511 1 : out << '[' << (int)data[0] << ' ' << (int)data[1] << ' ' << (int)data[2]
512 2 : << " ... " << (int)data[len - 3] << ' ' << (int)data[len - 2] << ' '
513 1 : << (int)data[len - 1] << ']' << std::endl;
514 1 : return;
515 : }
516 :
517 2 : std::ios init(NULL);
518 2 : init.copyfmt(out);
519 2 : if (getFormat() == Tformat::NCHW) {
520 7 : for (unsigned int k = 0; k < batch(); k++) {
521 10 : for (unsigned int l = 0; l < channel(); l++) {
522 19 : for (unsigned int i = 0; i < height(); i++) {
523 42 : for (unsigned int j = 0; j < width(); j++) {
524 28 : out << std::setw(10) << (int)this->getValue(k, l, i, j) << " ";
525 : }
526 : out << std::endl;
527 : }
528 : out << std::endl;
529 : }
530 : out << "-------" << std::endl;
531 : }
532 : } else {
533 0 : for (unsigned int k = 0; k < batch(); k++) {
534 0 : for (unsigned int i = 0; i < height(); i++) {
535 0 : for (unsigned int j = 0; j < width(); j++) {
536 0 : for (unsigned int l = 0; l < channel(); l++) {
537 0 : out << std::setw(10) << (int)this->getValue(k, l, i, j) << " ";
538 : }
539 : out << std::endl;
540 : }
541 : out << std::endl;
542 : }
543 : out << "-------" << std::endl;
544 : }
545 0 : out.copyfmt(init);
546 : }
547 :
548 : /// print quantization information
549 2 : const float *q_scales = (float *)getScale();
550 :
551 2 : if (scale_size() > 50) {
552 0 : out << "Scale factors: [" << q_scales[0] << ' ' << q_scales[1] << ' '
553 0 : << q_scales[2] << " ... " << q_scales[len - 3] << ' '
554 0 : << q_scales[len - 2] << ' ' << q_scales[len - 1] << ']' << std::endl;
555 : return;
556 : }
557 :
558 2 : out << "Scale factors: ";
559 4 : for (unsigned i = 0; i < scale_size(); ++i) {
560 2 : out << q_scales[i] << " ";
561 : }
562 : out << std::endl;
563 : }
564 :
565 9 : size_t CharTensor::getMemoryBytes() const {
566 9 : return bytes() + scale_size() * sizeof(float);
567 : }
568 :
569 101 : size_t CharTensor::scale_size() const {
570 101 : switch (qscheme) {
571 : case QScheme::PER_TENSOR_AFFINE:
572 : return 1;
573 : break;
574 15 : case QScheme::PER_CHANNEL_AFFINE:
575 15 : return width();
576 : break;
577 : default:
578 : break;
579 : }
580 0 : return 0;
581 : }
582 :
583 6 : QScheme CharTensor::q_scheme() const { return qscheme; }
584 :
585 9 : void CharTensor::copy(const void *buf) {
586 9 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
587 : << getName() << " is not contiguous, cannot copy.";
588 :
589 9 : if (buf == getData()) {
590 : return;
591 : }
592 :
593 9 : scopy(size(), (int8_t *)buf, 1, (int8_t *)getData(), 1);
594 :
595 9 : float *scales = (float *)(((int8_t *)buf) + size());
596 9 : scopy(scale_size(), scales, 1, (float *)getScale(), 1);
597 : }
598 :
599 0 : void CharTensor::save_quantization_info(std::ostream &file) {
600 0 : checkedWrite(file, (char *)&qscheme, sizeof(uint16_t),
601 : "[CharTensor::save] failed to write quantization information");
602 0 : }
603 :
604 0 : void CharTensor::read_quantization_info(std::ifstream &file,
605 : size_t start_offset,
606 : bool read_from_offset) {
607 0 : checkedRead(file, (char *)&qscheme, sizeof(uint16_t),
608 : "[CharTensor::read] failed to read quantization information",
609 : start_offset, read_from_offset);
610 0 : }
611 :
612 : } // namespace nntrainer
|