// Half-precision (IEEE 754 binary16) <-> single-precision (binary32)
// conversion helpers, followed by a small round-trip demo.
//
// based on https://gist.github.com/martin-kallman/5049614

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Earlier byte-inspection experiment, kept commented out:
//
//int main()
//{
//    int i = 0;
//    float fVal = -255.123456789;
//    char *pChar;
//    pChar = (char *)&fVal;
//
//    for (i = 0; i<4; i++)
//    {
//        printf("chs[%d] = %x\n", i, pChar[i]);
//    }
//
//    pChar[0] = 0;
//    /*pChar[1] = 0;
//    pChar[2] = 0;
//    pChar[3] = 0;*/
//    float * pFloat = (float *)pChar;
//    printf("fVal = %0.9f\n", pFloat[0]);
//    getchar();
//}

// float32
// Martin Kallman
//
// Fast half-precision to single-precision floating point conversion
//  - Supports signed zero and denormals-as-zero (DAZ)
//  - Does not support infinities or NaN (a half exponent of 31 is expanded
//    like an ordinary exponent)
//  - Few, partially pipelinable instructions (original author's note:
//    core operations ~6 clock cycles on modern x86-64)
//
// out: receives the converted single-precision value.
// in:  the 16 raw bits of the half-precision value, carried in a short.
void float32(float *__restrict out, const short in)
{
    // Go through unsigned short first so a negative `in` (sign bit set)
    // contributes exactly its low 16 bits.
    const uint32_t half = (unsigned short)in;

    uint32_t t1 = half & 0x7fffu;   // Non-sign bits
    uint32_t t2 = half & 0x8000u;   // Sign bit
    const uint32_t t3 = half & 0x7c00u;   // Exponent

    t1 <<= 13u;                     // Align mantissa on MSB
    t2 <<= 16u;                     // Shift sign bit into position

    t1 += 0x38000000u;              // Adjust bias (127 - 15) << 23

    t1 = (t3 == 0 ? 0 : t1);        // Denormals-as-zero

    t1 |= t2;                       // Re-insert sign bit

    // memcpy instead of a pointer cast: writing a float object through an
    // unsigned int lvalue breaks the effective-type (strict aliasing) rule.
    memcpy(out, &t1, sizeof *out);
}

// float16
// Martin Kallman
//
// Fast single-precision to half-precision floating point conversion
//  - Supports signed zero, denormals-as-zero (DAZ), flush-to-zero (FTZ),
//    clamp-to-max
//  - Does not support infinities or NaN (they are clamped to the largest
//    finite half, keeping the sign)
//  - The mantissa is truncated, not rounded
//  - Few, partially pipelinable instructions (original author's note:
//    core operations ~10 clock cycles on modern x86-64)
//
// out: receives the 16 raw bits of the half-precision value.
// in:  the single-precision value to convert.
void float16(short *__restrict out, const float in)
{
    uint32_t inu;
    // memcpy instead of *(unsigned int *)&in: avoids the strict-aliasing
    // violation and does not cast away const.
    memcpy(&inu, &in, sizeof inu);

    uint32_t t1 = inu & 0x7fffffffu;        // Non-sign bits
    uint32_t t2 = inu & 0x80000000u;        // Sign bit
    const uint32_t t3 = inu & 0x7f800000u;  // Exponent

    t1 >>= 13u;                     // Align mantissa on MSB
    t2 >>= 16u;                     // Shift sign bit into position

    t1 -= 0x1c000u;                 // Adjust bias; may wrap for tiny inputs,
                                    // which the flush below then discards

    // Flush-to-zero: anything below the smallest normal half (float biased
    // exponent 113). This also covers zero and float denormals (t3 == 0).
    t1 = (t3 < 0x38800000u) ? 0 : t1;

    // Clamp-to-max: float biased exponent 142 (0x47000000) is the largest
    // that still fits a finite half. The former threshold 0x8e000000 could
    // never be exceeded by the masked exponent, so the clamp never fired.
    t1 = (t3 > 0x47000000u) ? 0x7bffu : t1;

    t1 |= t2;                       // Re-insert sign bit

    // Store the low 16 bits bit-for-bit; avoids an implementation-defined
    // unsigned -> signed short conversion when the sign bit is set.
    const uint16_t half = (uint16_t)t1;
    memcpy(out, &half, sizeof *out);
}

#define ABS(A) ((A) >= 0 ? (A) : -(A))

// Round-trip demo. Compiled by default; define FLOAT16_NO_DEMO_MAIN to
// reuse the converters in a program that supplies its own main().
#ifndef FLOAT16_NO_DEMO_MAIN
int main(void)
{
    float original = -42.42f;

    short half_bits = 0;
    float16(&half_bits, original);

    float quantized = 0.0f;
    float32(&quantized, half_bits);

    float diff = ABS(original - quantized);
    printf("orig %f quantized %f absdiff %f\n", original, quantized, diff);
    getchar();   // keep the console window open until a key is pressed
    //assert(diff < 0.1f);
    return 0;
}
#endif
// Tags: float16, quantized, int, float, t1, interconversion, diff, original, float32. From: https://www.cnblogs.com/profession/p/16785638.html