Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * @file short_tensor.cpp
4 : * @date 10 January 2025
5 : * @brief This is the ShortTensor class for 16-bit signed integer calculation
6 : * @see https://github.com/nnstreamer/nntrainer
7 : * @author Donghyeon Jeong <dhyeon.jeong@samsung.com>
8 : * @bug No known bugs except for NYI items
9 : */
10 :
11 : #include <iomanip>
12 : #include <iostream>
13 :
14 : #include <cpu_backend.h>
15 : #include <short_tensor.h>
16 : #include <tensor.h>
17 :
18 : namespace nntrainer {
19 :
20 1 : ShortTensor::ShortTensor(std::string name_, Tformat fm, QScheme qscheme_) :
21 2 : TensorBase(name_, fm, Tdatatype::QINT16), qscheme(qscheme_) {}
22 :
23 21 : ShortTensor::ShortTensor(const TensorDim &d, bool alloc_now, Initializer init,
24 21 : std::string name, QScheme qscheme_) :
25 21 : TensorBase(d, alloc_now, init, name), qscheme(qscheme_) {
26 21 : if (alloc_now)
27 21 : allocate();
28 21 : }
29 :
30 6 : ShortTensor::ShortTensor(const TensorDim &d, const void *buf,
31 6 : QScheme qscheme_) :
32 6 : ShortTensor(d, true, Initializer::NONE, "", qscheme_) {
33 6 : if (d.getDataLen() != 0) {
34 6 : if (buf != nullptr)
35 1 : copy(buf);
36 : }
37 6 : }
38 :
39 1 : ShortTensor::ShortTensor(
40 : std::vector<std::vector<std::vector<std::vector<int16_t>>>> const &d,
41 1 : std::vector<float> const &scales, Tformat fm, QScheme qscheme_) :
42 2 : qscheme(qscheme_) {
43 1 : if (d.empty() || d[0].empty() || d[0][0].empty() || d[0][0][0].empty()) {
44 : throw std::out_of_range(
45 0 : "[Tensor] trying to initialize ShortTensor from empty vector");
46 : }
47 :
48 1 : dim.setTensorDim(0, d.size());
49 1 : if (fm == Tformat::NCHW) {
50 1 : dim.setTensorDim(1, d[0].size());
51 1 : dim.setTensorDim(2, d[0][0].size());
52 1 : dim.setTensorDim(3, d[0][0][0].size());
53 : } else {
54 0 : dim.setTensorDim(2, d[0].size());
55 0 : dim.setTensorDim(3, d[0][0].size());
56 0 : dim.setTensorDim(1, d[0][0][0].size());
57 : }
58 :
59 : dim.setTensorType({fm, Tdatatype::QINT16});
60 :
61 1 : strides = dim.computeStrides();
62 1 : contiguous = true;
63 1 : initializer = Initializer::NONE;
64 :
65 : MemoryData *mem_data = new MemoryData(
66 1 : (void *)(new int16_t[dim.getDataLen() +
67 98 : sizeof(float) / sizeof(int16_t) * scale_size()]()));
68 1 : data = std::shared_ptr<MemoryData>(mem_data, [](MemoryData *mem_data) {
69 1 : delete[] mem_data->getAddr<int16_t>();
70 1 : delete mem_data;
71 : });
72 :
73 1 : offset = 0;
74 :
75 : // if fm == Tformat::NCHW: dim[0] == batch, dim[1] == channel, dim[2] ==
76 : // height, dim[3] == width; if fm == Tformat::NHWC: dim[0] == batch, dim[1]
77 : // == height, dim[2] == width, dim[3] == channel (see the sketch below)
78 1 : if (fm == Tformat::NCHW) {
79 2 : for (unsigned int i = 0; i < batch(); ++i)
80 4 : for (unsigned int j = 0; j < channel(); ++j)
81 12 : for (unsigned int k = 0; k < height(); ++k)
82 99 : for (unsigned int l = 0; l < width(); ++l)
83 90 : this->setValue(i, j, k, l, d[i][j][k][l]);
84 : } else {
85 0 : for (unsigned int i = 0; i < batch(); ++i)
86 0 : for (unsigned int j = 0; j < height(); ++j)
87 0 : for (unsigned int k = 0; k < width(); ++k)
88 0 : for (unsigned int l = 0; l < channel(); ++l)
89 0 : this->setValue(i, l, j, k, d[i][j][k][l]);
90 : }
91 :
92 : // copy scale factors
93 1 : scopy(scale_size(), scales.data(), 1, (float *)getScale(), 1);
94 1 : }
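
// Illustrative sketch (editorial addition, not part of the covered source):
// how the nested-vector constructor above lays out its input. The type,
// format, and scheme names come from the code above; the values themselves
// are hypothetical.
//
//   std::vector<std::vector<std::vector<std::vector<int16_t>>>> d =
//     {{{{1, 2, 3}, {4, 5, 6}}}};           // NCHW shape 1 x 1 x 2 x 3
//   std::vector<float> scales = {0.02f};    // PER_TENSOR_AFFINE -> 1 scale
//   ShortTensor t(d, scales, Tformat::NCHW, QScheme::PER_TENSOR_AFFINE);
//   // t.getValue(0, 0, 1, 2) == 6 and *(float *)t.getScale() == 0.02f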
95 :
96 4 : bool ShortTensor::operator==(const ShortTensor &rhs) const {
97 4 : if (qscheme != rhs.qscheme)
98 : return false;
99 :
100 : // compare quantized data
101 4 : const int16_t *_data = (int16_t *)getData();
102 4 : const int16_t *_rdata = (int16_t *)rhs.getData();
103 474 : for (size_t i = 0; i < size(); ++i) {
104 470 : if (_data[i] != _rdata[i])
105 : return false;
106 : }
107 :
108 : // compare scale factors
109 4 : const float *_scales = (float *)getScale();
110 4 : const float *_rscales = (float *)rhs.getScale();
111 8 : for (size_t i = 0; i < scale_size(); ++i) {
112 4 : if (std::fabs(_scales[i] - _rscales[i]) > 1e-5)
113 : return false;
114 : }
115 :
116 : return true;
117 : }
118 :
119 21 : void ShortTensor::allocate() {
120 21 : if (empty() || data)
121 : return;
122 :
123 21 : if (src_tensor) {
124 : /// allocate data based on the source tensor
125 0 : allocateSrcTensor();
126 : /** as this memory is shared, do NOT initialize */
127 : } else {
128 : /// allocate new memory for the tensor data
129 : MemoryData *mem_data;
130 :
131 : mem_data = new MemoryData(
132 21 : (void *)(new int16_t[dim.getDataLen() +
133 1111 : sizeof(float) / sizeof(int16_t) * scale_size()]{}));
134 21 : data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
135 21 : delete[] mem_data->template getAddr<int16_t>();
136 21 : delete mem_data;
137 : });
138 :
139 21 : offset = 0;
140 21 : initialize();
141 : }
142 : }
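
// Memory layout note (editorial addition): allocate() and the constructor
// above keep the quantized data and the scale factors in one int16_t
// allocation, with getScale() pointing right after the data. A worked
// example with hypothetical dimensions, assuming PER_TENSOR_AFFINE:
//
//   TensorDim 1 x 2 x 3 x 5   -> dim.getDataLen() == 30 int16_t elements
//   scale_size() == 1         -> sizeof(float) / sizeof(int16_t) * 1 == 2
//   allocation: new int16_t[30 + 2]            // 32 slots == 64 bytes
//   getScale() == (int16_t *)getData() + 30    // last 2 slots hold 1 float
//   getMemoryBytes() == bytes() + scale_size() * sizeof(float) == 60 + 4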
143 :
144 0 : void ShortTensor::deallocate() {
145 : data = nullptr;
146 0 : offset = 0;
147 0 : }
148 :
149 770 : void *ShortTensor::getData() const {
150 770 : if (!data)
151 : return nullptr;
152 :
153 : data->validate();
154 770 : return data->getAddr<int16_t>() + offset;
155 : }
156 :
157 1 : void *ShortTensor::getData(size_t idx) const {
158 1 : if (!data)
159 : return nullptr;
160 :
161 : data->validate();
162 1 : return data->getAddr<int16_t>() + offset + idx;
163 : }
164 :
165 15 : void *ShortTensor::getScale() const {
166 15 : if (!data)
167 : return nullptr;
168 :
169 : data->validate();
170 15 : return ((int16_t *)getData()) + size();
171 : }
172 :
173 0 : void *ShortTensor::getScale(size_t idx) const {
174 0 : NNTR_THROW_IF(idx > scale_size(), std::invalid_argument)
175 : << "Tensor::getScale() index is not valid";
176 :
177 0 : if (!data)
178 : return nullptr;
179 :
180 : data->validate();
181 0 : return ((float *)getScale()) + idx;
182 : }
183 :
184 0 : void *ShortTensor::getAddress(unsigned int i) {
185 0 : size_t index = getIndex(batch(), channel(), height(), width());
186 0 : if (i > index) {
187 : return nullptr;
188 : }
189 0 : return &((int16_t *)getData())[i];
190 : }
191 :
192 0 : const void *ShortTensor::getAddress(unsigned int i) const {
193 0 : size_t index = getIndex(batch(), channel(), height(), width());
194 0 : if (i > index) {
195 : return nullptr;
196 : }
197 0 : return &((int16_t *)getData())[i];
198 : }
199 :
200 4 : const int16_t &ShortTensor::getValue(unsigned int i) const {
201 4 : return ((int16_t *)getData())[i];
202 : }
203 :
204 29 : int16_t &ShortTensor::getValue(unsigned int i) {
205 29 : return ((int16_t *)getData())[i];
206 : }
207 :
208 4 : const int16_t &ShortTensor::getValue(unsigned int b, unsigned int c,
209 : unsigned int h, unsigned int w) const {
210 4 : return getValue(getIndex(b, c, h, w));
211 : }
212 :
213 5 : int16_t &ShortTensor::getValue(unsigned int b, unsigned int c, unsigned int h,
214 : unsigned int w) {
215 5 : return getValue(getIndex(b, c, h, w));
216 : }
217 :
218 7 : void ShortTensor::setValue(float value) {
219 7 : int16_t *data = (int16_t *)getData();
220 7 : std::fill(data, data + size(), static_cast<int16_t>(value));
221 7 : }
222 :
223 1 : void ShortTensor::addValue(unsigned int b, unsigned int c, unsigned int h,
224 : unsigned int w, float value, float beta) {
225 1 : auto const &idx = getIndex(b, c, h, w);
226 1 : float output = ((int16_t *)getData())[idx];
227 1 : output *= beta;
228 1 : output += value;
229 :
230 1 : ((int16_t *)getData())[idx] = static_cast<int16_t>(std::trunc(output));
231 1 : }
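
// Worked example (editorial addition) for addValue() above, with
// hypothetical inputs: if the stored value at (b, c, h, w) is 5, beta is
// 0.5f, and value is 2.3f, then output = 5 * 0.5 + 2.3 = 4.8, and
// std::trunc(4.8) stores 4.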
232 :
233 601 : void ShortTensor::setValue(unsigned int b, unsigned int c, unsigned int h,
234 : unsigned int w, float value) {
235 601 : ((int16_t *)getData())[getIndex(b, c, h, w)] = static_cast<int16_t>(value);
236 601 : }
237 :
238 5 : void ShortTensor::setZero() {
239 : /// @todo replace with apply_i or scal
240 5 : setValue(0);
241 5 : }
242 :
243 24 : void ShortTensor::initialize() {
244 24 : if (empty() || !isAllocated())
245 : return;
246 :
247 : /// @note Sampling from the normal/uniform distribution is invalid
248 24 : switch (initializer) {
249 4 : case Initializer::ZEROS:
250 4 : setZero();
251 4 : break;
252 2 : case Initializer::ONES:
253 2 : setValue(1.0f);
254 2 : break;
255 : case Initializer::NONE:
256 : break;
257 0 : default:
258 0 : throw std::invalid_argument("Initializer not valid for " +
259 0 : getStringDataType());
260 : break;
261 : }
262 :
263 24 : putData();
264 : }
265 :
266 0 : void ShortTensor::initialize(Initializer init) {
267 0 : initializer = init;
268 0 : initialize();
269 0 : }
270 :
271 1 : void ShortTensor::copy(const Tensor &from) {
272 1 : reshape(from.getDim());
273 1 : copy(from.getData());
274 1 : }
275 :
276 0 : void ShortTensor::copyData(const Tensor &from) {
277 0 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
278 : << getName() << " is not contiguous, cannot copy.";
279 :
280 0 : NNTR_THROW_IF(size() != from.size(), std::invalid_argument)
281 : << "Size of tensor to copy must match";
282 :
283 : /// @todo support copy from other data types
284 0 : switch (from.getDataType()) {
285 : case ml::train::TensorDim::DataType::QINT16:
286 0 : copy(from.getData());
287 0 : break;
288 0 : case ml::train::TensorDim::DataType::FP32:
289 0 : copy_fp32(from.size(), from.getData<float>(), (int16_t *)getData());
290 0 : break;
291 0 : default:
292 0 : throw std::invalid_argument("Error: Unsupported data type");
293 : break;
294 : }
295 0 : }
296 :
297 0 : void ShortTensor::copy_with_stride(const Tensor &input, Tensor &output) {
298 0 : for (unsigned int b = 0; b < output.batch(); ++b) {
299 0 : for (unsigned int c = 0; c < output.channel(); ++c) {
300 0 : for (unsigned int h = 0; h < output.height(); ++h) {
301 0 : for (unsigned int w = 0; w < output.width(); ++w) {
302 0 : output.setValue(b, c, h, w, input.getValue<int16_t>(b, c, h, w));
303 : }
304 : }
305 : }
306 : }
307 0 : }
308 :
309 1 : void ShortTensor::save(std::ostream &file) {
310 : /// @note Save quantization information
311 1 : save_quantization_info(file);
312 :
313 1 : std::streamsize sz = static_cast<std::streamsize>(getMemoryBytes());
314 :
315 1 : NNTR_THROW_IF(sz < 0, std::invalid_argument)
316 0 : << "save size: " << getMemoryBytes()
317 : << " is too big. It cannot be represented by std::streamsize";
318 :
319 1 : checkedWrite(file, (char *)getData(), sz,
320 : "[ShortTensor::save] operation failed");
321 1 : putData();
322 1 : }
323 :
324 1 : void ShortTensor::read(std::ifstream &file, size_t start_offset,
325 : bool read_from_offset) {
326 1 : if (start_offset == std::numeric_limits<size_t>::max()) {
327 0 : start_offset = file_offset;
328 : }
329 1 : read_quantization_info(file, start_offset, read_from_offset);
330 :
331 1 : std::streamsize sz = static_cast<std::streamsize>(getMemoryBytes());
332 :
333 1 : NNTR_THROW_IF(sz < 0, std::invalid_argument)
334 0 : << "read size: " << getMemoryBytes()
335 : << " is too big. It cannot be represented by std::streamsize";
336 :
337 1 : if (read_from_offset) {
338 0 : start_offset += sizeof(uint16_t);
339 : }
340 :
341 1 : checkedRead(file, (char *)getData(), sz,
342 : "[ShortTensor::read] operation failed", start_offset,
343 : read_from_offset);
344 1 : putData();
345 1 : }
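
// Serialized layout (editorial addition), as produced by save() and read
// back by read() above; the sizes follow directly from the code:
//
//   [ qscheme        : sizeof(uint16_t) bytes             ]
//   [ quantized data : size() * sizeof(int16_t) bytes     ]
//   [ scale factors  : scale_size() * sizeof(float) bytes ]
//
// i.e. sizeof(uint16_t) + getMemoryBytes() bytes in total; data and scales
// are written in a single block because they are contiguous in memory.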
346 :
347 1 : std::vector<unsigned int> ShortTensor::argmax() const {
348 : std::vector<unsigned int> result;
349 1 : const int16_t *data = (int16_t *)getData();
350 : size_t batch_size = batch();
351 1 : size_t feature_len = dim.getFeatureLen();
352 :
353 1 : result.resize(batch_size);
354 :
355 3 : for (unsigned int b = 0; b < batch_size; b++) {
356 : auto max_iter =
357 2 : std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
358 2 : result[b] = std::distance(data, max_iter) - (b * feature_len);
359 : }
360 1 : return result;
361 0 : }
362 :
363 1 : std::vector<unsigned int> ShortTensor::argmin() const {
364 : std::vector<unsigned int> result;
365 1 : const int16_t *data = (int16_t *)getData();
366 : size_t batch_size = batch();
367 1 : size_t feature_len = dim.getFeatureLen();
368 :
369 1 : result.resize(batch_size);
370 :
371 3 : for (unsigned int b = 0; b < batch_size; b++) {
372 : auto min_iter =
373 2 : std::min_element(data + b * feature_len, data + (b + 1) * feature_len);
374 2 : result[b] = std::distance(data, min_iter) - (b * feature_len);
375 : }
376 1 : return result;
377 0 : }
378 :
379 1 : float ShortTensor::max_abs() const {
380 1 : const int16_t *data = (int16_t *)getData();
381 : unsigned int idx;
382 :
383 1 : int16_t max_val = data[0];
384 4 : for (unsigned int i = 1; i < size(); i += 1) {
385 3 : int16_t cur_val = (data[i] >= 0) ? data[i] : -1 * data[i];
386 3 : if (cur_val > max_val) {
387 : max_val = cur_val;
388 : }
389 : }
390 :
391 1 : return max_val;
392 : }
393 :
394 1 : float ShortTensor::maxValue() const {
395 1 : const int16_t *data = (int16_t *)getData();
396 1 : return *std::max_element(data, data + size());
397 : }
398 :
399 1 : float ShortTensor::minValue() const {
400 1 : const int16_t *data = (int16_t *)getData();
401 1 : return *std::min_element(data, data + size());
402 : }
403 :
404 1 : void ShortTensor::print(std::ostream &out) const {
405 1 : const int16_t *data = (int16_t *)getData();
406 1 : unsigned int len = size();
407 1 : out << "data addr: " << reinterpret_cast<const float *>(data) << '\n';
408 1 : out << dim;
409 :
410 1 : if (len > 512) {
411 0 : out << '[' << (int)data[0] << ' ' << (int)data[1] << ' ' << (int)data[2]
412 0 : << " ... " << (int)data[len - 3] << ' ' << (int)data[len - 2] << ' '
413 0 : << (int)data[len - 1] << ']' << std::endl;
414 0 : return;
415 : }
416 :
417 1 : std::ios init(NULL);
418 1 : init.copyfmt(out);
419 1 : if (getFormat() == Tformat::NCHW) {
420 2 : for (unsigned int k = 0; k < batch(); k++) {
421 2 : for (unsigned int l = 0; l < channel(); l++) {
422 3 : for (unsigned int i = 0; i < height(); i++) {
423 6 : for (unsigned int j = 0; j < width(); j++) {
424 4 : out << std::setw(10) << (int)this->getValue(k, l, i, j) << " ";
425 : }
426 : out << std::endl;
427 : }
428 : out << std::endl;
429 : }
430 : out << "-------" << std::endl;
431 : }
432 : } else {
433 0 : for (unsigned int k = 0; k < batch(); k++) {
434 0 : for (unsigned int i = 0; i < height(); i++) {
435 0 : for (unsigned int j = 0; j < width(); j++) {
436 0 : for (unsigned int l = 0; l < channel(); l++) {
437 0 : out << std::setw(10) << (int)this->getValue(k, l, i, j) << " ";
438 : }
439 : out << std::endl;
440 : }
441 : out << std::endl;
442 : }
443 : out << "-------" << std::endl;
444 : }
445 0 : out.copyfmt(init);
446 : }
447 :
448 : /// print quantization information
449 1 : const float *q_scales = (float *)getScale();
450 :
451 1 : if (scale_size() > 50) {
452 0 : out << "Scale factors: [" << q_scales[0] << ' ' << q_scales[1] << ' '
453 0 : << q_scales[2] << " ... " << q_scales[scale_size() - 3] << ' '
454 0 : << q_scales[scale_size() - 2] << ' ' << q_scales[scale_size() - 1] << ']' << std::endl;
455 : return;
456 : }
457 :
458 1 : out << "Scale factors: ";
459 2 : for (unsigned i = 0; i < scale_size(); ++i) {
460 1 : out << q_scales[i] << " ";
461 : }
462 : out << std::endl;
463 : }
464 :
465 3 : size_t ShortTensor::getMemoryBytes() const {
466 3 : return bytes() + scale_size() * sizeof(float);
467 : }
468 :
469 41 : size_t ShortTensor::scale_size() const {
470 41 : switch (qscheme) {
471 : case QScheme::PER_TENSOR_AFFINE:
472 : return 1;
473 : break;
474 2 : case QScheme::PER_CHANNEL_AFFINE:
475 2 : return height();
476 : break;
477 : default:
478 : break;
479 : }
480 0 : return 0;
481 : }
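
// Example (editorial addition) of how scale_size() above depends on the
// quantization scheme, for a hypothetical 1 x 1 x 8 x 16 tensor:
//
//   QScheme::PER_TENSOR_AFFINE  -> 1 scale factor for the whole tensor
//   QScheme::PER_CHANNEL_AFFINE -> height() == 8 scale factors
//   any other scheme            -> 0 (no scale factors are stored)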
482 :
483 0 : QScheme ShortTensor::q_scheme() const { return qscheme; }
484 :
485 2 : void ShortTensor::copy(const void *buf) {
486 2 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
487 : << getName() << " is not contiguous, cannot copy.";
488 :
489 2 : if (buf == getData()) {
490 : return;
491 : }
492 :
493 2 : copy_s16(size(), (int16_t *)buf, (int16_t *)getData());
494 :
495 2 : float *scales = (float *)(((int16_t *)buf) + size());
496 2 : scopy(scale_size(), scales, 1, (float *)getScale(), 1);
497 : }
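
// Note (editorial addition): copy(const void *buf) above expects buf to use
// the same packed layout as this tensor's own buffer, i.e. size() int16_t
// values immediately followed by scale_size() float scale factors.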
498 :
499 1 : void ShortTensor::save_quantization_info(std::ostream &file) {
500 1 : checkedWrite(file, (char *)&qscheme, sizeof(uint16_t),
501 : "[ShortTensor::save] failed to write quantization information");
502 1 : }
503 1 : void ShortTensor::read_quantization_info(std::ifstream &file,
504 : size_t start_offset,
505 : bool read_from_offset) {
506 1 : checkedRead(file, (char *)&qscheme, sizeof(uint16_t),
507 : "[ShortTensor::read] failed to read quantization information",
508 : start_offset, read_from_offset);
509 1 : }
510 :
511 : } // namespace nntrainer