Line data Source code
1 : // SPDX-License-Identifier: The MIT License (MIT)
2 : /**
3 : * Copyright (c) 2017 Facebook Inc.
4 : * Copyright (c) 2017 Georgia Institute of Technology
5 : * Copyright 2019 Google LLC
6 : */
7 : /**
8 : * @file fp16.cpp
9 : * @date 03 Nov 2023
10 : * @brief This is a collection of FP16 and FP32 conversion functions
11 : * @see https://github.com/nnstreamer/nntrainer
12 : * @author Marat Dukhan <maratek@gmail.com>
13 : * @bug No known bugs except for NYI items
14 : *
15 : */
16 :
#include <fp16.h>

#include <cmath>
#include <cstdint>
#include <cstring>
18 :
namespace nntrainer {

/**
 * @brief Reinterpret a 32-bit pattern as a single-precision float.
 *
 * @param[in] w bit pattern to reinterpret
 * @return float whose object representation is identical to @a w
 *
 * @note The generic fallback copies the bytes with std::memcpy. Reading the
 *       inactive member of a union is undefined behaviour in C++, whereas a
 *       memcpy between two trivially copyable objects of equal size is well
 *       defined.
 */
float fp32_from_bits(uint32_t w) {
#if defined(__OPENCL_VERSION__)
  return as_float(w);
#elif defined(__CUDA_ARCH__)
  return __uint_as_float(static_cast<unsigned int>(w));
#elif defined(__INTEL_COMPILER)
  return _castu32_f32(w);
  /// @todo resolve unknown identifier error
  // #elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
  // return _CopyFloatFromInt32((__int32)w);
#else
  static_assert(sizeof(float) == sizeof(uint32_t),
                "float must be exactly 32 bits wide");
  float value;
  std::memcpy(&value, &w, sizeof(value));
  return value;
#endif
}

/**
 * @brief Return the 32-bit pattern that represents a single-precision float.
 *
 * @param[in] f value to inspect
 * @return unsigned integer whose object representation is identical to @a f
 *
 * @note Uses std::memcpy in the generic fallback for the same reason as
 *       fp32_from_bits(): union-based punning is undefined behaviour in C++.
 */
uint32_t fp32_to_bits(float f) {
#if defined(__OPENCL_VERSION__)
  return as_uint(f);
#elif defined(__CUDA_ARCH__)
  return static_cast<uint32_t>(__float_as_uint(f));
#elif defined(__INTEL_COMPILER)
  return _castf32_u32(f);
  /// @todo resolve unknown identifier error
  // #elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
  // return (uint32_t)_CopyInt32FromFloat(f);
#else
  static_assert(sizeof(float) == sizeof(uint32_t),
                "float must be exactly 32 bits wide");
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return bits;
#endif
}

/**
 * @brief Convert a single-precision float to half-precision bits.
 *
 * @param[in] f value to convert
 * @return 16-bit pattern: the sign bit of @a f combined with either the
 *         converted exponent/mantissa or, for NaN inputs, the pattern 0x7E00
 */
uint16_t compute_fp32_to_fp16(float f) {
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
  (defined(__GNUC__) && !defined(__STRICT_ANSI__))
  const float scale_to_inf = 0x1.0p+112f;
  const float scale_to_zero = 0x1.0p-110f;
#else
  /* Same two constants as above, spelled as bit patterns for compilers that
   * are not known to accept hexadecimal floating literals. */
  const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
  const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
  /* Scaling up first lets magnitudes that are too large overflow to infinity
   * before they are scaled back down. */
  float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

  const uint32_t w = fp32_to_bits(f);
  /* w + w shifts the sign bit out, leaving exponent and mantissa only. */
  const uint32_t shl1_w = w + w;
  const uint32_t sign = w & UINT32_C(0x80000000);
  uint32_t bias = shl1_w & UINT32_C(0xFF000000);
  if (bias < UINT32_C(0x71000000)) {
    /* Clamp the exponent so that tiny inputs are added to a fixed-magnitude
     * bias below. */
    bias = UINT32_C(0x71000000);
  }

  /* Adding the bias makes the floating-point addition itself round the
   * value; the result bits are then picked out of the sum. */
  base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
  const uint32_t bits = fp32_to_bits(base);
  const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
  const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
  const uint32_t nonsign = exp_bits + mantissa_bits;

  /* Anything above the infinity pattern (exponent all ones, mantissa zero)
   * once the sign is removed is a NaN. */
  const bool is_nan = shl1_w > UINT32_C(0xFF000000);
  const uint32_t magnitude = is_nan ? UINT32_C(0x7E00) : nonsign;
  return static_cast<uint16_t>((sign >> 16) | magnitude);
}

/**
 * @brief Convert half-precision bits to a single-precision float.
 *
 * @param[in] h 16-bit half-precision pattern
 * @return the float obtained by converting @a h, with its sign bit preserved
 */
float compute_fp16_to_fp32(uint16_t h) {
  /* Place the half in the upper 16 bits so its sign lines up with the float
   * sign bit. */
  const uint32_t w = static_cast<uint32_t>(h) << 16;
  const uint32_t sign = w & UINT32_C(0x80000000);
  /* w + w shifts the sign bit out, leaving exponent and mantissa only. */
  const uint32_t two_w = w + w;

  /* Path used when two_w is at or above the cutoff below: move exponent and
   * mantissa into float position, add an exponent offset, then rescale. */
  const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
  (defined(__GNUC__) && !defined(__STRICT_ANSI__))
  const float exp_scale = 0x1.0p-112f;
#else
  /* Same constant as above, spelled as a bit pattern. */
  const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
  const float normalized_value =
    fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

  /* Path used when two_w is below the cutoff: OR the mantissa into the bit
   * pattern of 0.5f and subtract 0.5f again. */
  const uint32_t magic_mask = UINT32_C(126) << 23;
  const float magic_bias = 0.5f;
  const float denormalized_value =
    fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

  /* Below this value the exponent field of the half is zero. */
  const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
  const uint32_t magnitude = (two_w < denormalized_cutoff)
                               ? fp32_to_bits(denormalized_value)
                               : fp32_to_bits(normalized_value);
  return fp32_from_bits(sign | magnitude);
}

} /* namespace nntrainer */
|