Fix undefined behavior in exp10() #69
Conversation
patch looks good, but we need a signed contributor agreement (https://github.com/ARM-software/optimized-routines/blob/master/contributor-agreement.pdf) to merge it (required by our contribution policy so we can update glibc under our FSF copyright assignment). alternatively i can do the fix (and attribute the bug report to you) since it is a simple change likely below copyrightability |
Sure I'm printing it out now. Where should I mail the completed form? |
please send the filled pdf to optimized-routines-assignment@arm.com |
a scanned image or photo is fine too |
I mailed it! |
thanks, committed as 88ffd2e, but meanwhile i realized it is better to just copy the code from exp, so a followup commit fixed it differently. |
Thanks! It's an honor to be able to call myself a contributor to this project. Do you want these by any chance? I wrote them for x86 based on one of your functions. If so, where should I put them?
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
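// n = round(x/log(2)): adding the 0x1.8p23 shifter rounds x*log2(e) to the
// nearest integer, which is recovered below by subtracting the shifter back out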
const __m512 r = _mm512_set1_ps(0x1.8p23f);
const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
const __m512 n = _mm512_sub_ps(z, r);
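// b = x - n*log(2), with log(2) split into high and low parts so the reduction stays accurate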
const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
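// k = 2^n: n sits in the low mantissa bits of z, so shifting z's bit pattern
// left by 23 moves n into the exponent field, which is then added to the bits of 1.0f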
const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
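// lanes where |n| > 126 would over/underflow the single 2^n scale and must take the slow path below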
const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
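// u = b^2 and a degree-5 polynomial in b approximating expm1(b),
// so the fast-path result is k + k*j = 2^n * exp(b)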
const __m512 u = _mm512_mul_ps(b, b);
const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
_mm512_set1_ps(0x1.573e2ep-5f)), u,
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
_mm512_set1_ps(0x1.fffdb6p-2f))),
u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
if (_mm512_kortestz(c, c))
return _mm512_fmadd_ps(j, k, k);
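// slow path: split 2^n into two scale factors s1 and s2 so each stays representable;
// lanes with |n| > 192 saturate to 0 or +inf via s1*s1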
const __m512i g = _mm512_and_si512(
_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
_mm512_set1_epi32(0x82000000u));
const __m512 s1 =
_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
const __mmask16 d =
_mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
return _mm512_mask_blend_ps(
d, _mm512_mask_blend_ps(
c, _mm512_fmadd_ps(k, j, k),
_mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
_mm512_mul_ps(s1, s1));
}
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m256 ggml_v_expf(__m256 x) {
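// same algorithm as the AVX-512 version above, but using full-width compare
// masks and bitwise blends instead of AVX-512 mask registers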
const __m256 r = _mm256_set1_ps(0x1.8p23f);
const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
const __m256 n = _mm256_sub_ps(z, r);
const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
_mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
const __m256 k = _mm256_castsi256_ps(
_mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
const __m256i c = _mm256_castps_si256(
_mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
_mm256_set1_ps(126), _CMP_GT_OQ));
const __m256 u = _mm256_mul_ps(b, b);
const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
_mm256_set1_ps(0x1.573e2ep-5f)), u,
_mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
_mm256_set1_ps(0x1.fffdb6p-2f))),
u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
return _mm256_fmadd_ps(j, k, k);
const __m256i g = _mm256_and_si256(
_mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
_mm256_set1_epi32(0x82000000u));
const __m256 s1 =
_mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
const __m256i d = _mm256_castps_si256(
_mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
_mm256_set1_ps(192), _CMP_GT_OQ));
return _mm256_or_ps(
_mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
_mm256_andnot_ps(
_mm256_castsi256_ps(d),
_mm256_or_ps(
_mm256_and_ps(_mm256_castsi256_ps(c),
_mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
_mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}
#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
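// same algorithm once more, written with the MADD128/NMADD128 macros above
// so it also builds without FMA support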
const __m128 r = _mm_set1_ps(0x1.8p23f);
const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
const __m128 n = _mm_sub_ps(z, r);
const __m128 b =
NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
const __m128i c =
_mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
const __m128 u = _mm_mul_ps(b, b);
const __m128 j =
MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
if (!_mm_movemask_epi8(c))
return MADD128(j, k, k);
const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
_mm_set1_epi32(0x82000000u));
const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
const __m128i d =
_mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
return _mm_or_ps(
_mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
_mm_andnot_ps(_mm_castsi128_ps(d),
_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
_mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
} |
Hi @jart, you might want to check whether these implementations are an improvement for the x86_64 libmvec in glibc. |
@zatrazz Sure I'm happy to do that. Could you help me understand how? I spent about an hour trying to figure out how to call their function and haven't had much luck so far. |
@jart the libmvec is indeed quite hard to understand. The x86_64 implementations are in the sysdeps/x86_64/fpu and sysdeps/x86_64/fpu/multiarch folders and start with the svml_ prefix [1] (they originally came from the Intel Intrinsics for Short Vector Math Library Operations). The 'svml_s_' ones are single-precision while 'svml_d_' are double-precision, and iFUNC is used by default. So the AVX2 version will call the svml_s_expf8_core.c iFUNC resolver, which routes to svml_s_expf8_core_avx2.S. We have tests for the libmvec calls that are built in the 'math' folder. For instance, the AVX2 one is named math/test-float-libmvec-expf-avx2 and you can run it by issuing: $ make test t=math/test-float-libmvec-expf-avx2 The libmvec tests start at test-vector-abi.h (which disables the runtime tests if the underlying CPU does not support the vector ABI) and then include test-vector-abi-arg1.h (for the case of 1 argument). The tests rely on OMP simd pragmas so the compiler will generate calls to the vectorized functions (the library is mostly used by compiler-generated calls from auto-vectorized loops). The current x86_64 implementations are all in assembly with only the iFUNC resolvers in C (the Intel way...), but ARM did it in a simpler way for their port of the ARM optimized routines (for instance pow - https://sourceware.org/git/?p=glibc.git;a=commit;h=0fed0b250f728f38bca5f6fba1dcecdccfc6a44e). |
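To make the auto-vectorization path concrete, here is a minimal sketch of the kind of loop whose expf calls the compiler turns into libmvec calls; the compile flags and the _ZGVdN8v_expf symbol name are assumptions about a typical x86_64 GCC setup rather than something taken from the discussion above:
// sketch: compile with something like
//   gcc -O2 -ffast-math -fopenmp-simd -mavx2 demo.c -lmvec -lm
#include <math.h>

void vexp(const float *in, float *out, int n) {
    // the omp simd pragma lets the compiler vectorize this loop, so the scalar
    // expf() calls become calls to a vector-ABI symbol such as _ZGVdN8v_expf
    // (the unmasked 8-lane AVX2 variant provided by libmvec)
    #pragma omp simd
    for (int i = 0; i < n; i++)
        out[i] = expf(in[i]);
}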
I've never once in my entire life (despite many years of trying) managed to successfully compile glibc. So I expropriated the code manually into a freestanding assembly file https://justine.lol/tmp/libmvec_expf_avx2.S and I got it to run. My code is faster than glibc libmvec. This isn't a very good benchmark. But here's what I got on Threadripper 7995WX. This measures how many nanoseconds it takes to compute each
I believe the reason my code is better is that it can be inlined. That way, all my |
It should be as easy as "./configure --prefix=/usr && make -j$(nproc)" (we have improved the build experience over the years). We even have a script (scripts/build-many-glibcs.py) that bootstraps a complete compiler for all supported ABIs. I need to check whether we have a proper benchmark for vectorized expf, but for a more meaningful comparison I would expect the expf version to be built as a proper symbol. Although the inline version works as a header-only library to embed within a project, it is not easy to integrate with how glibc libmvec was originally designed (as a libcall for auto-generated calls from the compiler). |
We also have some conformance and maximum-expected-error tests, so it would be good if your implementation also adheres to all expected semantics (since some error code paths can be really costly). |
Here's the benchmark program I wrote, which also proves the correctness of my implementations, i.e. max ulp error is 2. https://gist.github.com/jart/5f5bddd513e338da70b0a81b62f9a116 I think my code could be useful for you to implement your compiler runtime function. In my benchmark, if I add |
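For reference, a rough sketch of how such a max-ULP check can be structured (this is not the gist above; it assumes the SSE ggml_v_expf from the earlier comment is visible in the same translation unit):
#include <immintrin.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// map a float's bit pattern onto an integer line that is monotone in the
// float ordering, so ulp distance becomes a plain integer subtraction
static int64_t float_order(float f) {
    int32_t i;
    memcpy(&i, &f, sizeof i);
    return i >= 0 ? (int64_t)i : (int64_t)INT32_MIN - i;
}

int main(void) {
    int64_t worst = 0;
    for (float x = -104.0f; x <= 89.0f; x += 0.0005f) {
        float got[4], in[4] = {x, x, x, x};
        _mm_storeu_ps(got, ggml_v_expf(_mm_loadu_ps(in)));        // vector result
        int64_t d = float_order(got[0]) - float_order(expf(x));   // vs scalar libm
        if (d < 0) d = -d;
        if (d > worst) worst = d;
    }
    printf("max ulp error: %lld\n", (long long)worst);
    return 0;
}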
thanks, i think it would be useful to collect such code somewhere, but i think we don't want to maintain x86-only code in this repo. (at some point i tried to write simd code in this repo in a generic way, but there are enough performance and isa differences that it didn't work out.) sleef is an open-source vector math project, maybe they would take it (but they don't directly upstream to glibc libmvec), or you can submit the code to glibc (they nowadays accept sign-off instead of copyright assignment, but you still have to keep working with maintainers to get a patch accepted). |
Well thank you for considering it @nsz-arm. I'll keep an eye out for other opportunities to contribute here, so that hopefully our paths may cross again :-) |
Left shifting negative values is undefined behavior.
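For context, the class of bug being fixed looks roughly like this (a generic sketch with made-up helper names, not the actual exp10() source):
#include <stdint.h>

// shifting a negative signed value left is undefined behavior in C
uint64_t scale_bits_ub(int64_t k) {
    return k << 52;            // UB whenever k is negative
}

// the usual fix: do the shift on an unsigned type, where it is well defined
uint64_t scale_bits_ok(int64_t k) {
    return (uint64_t)k << 52;  // wraps modulo 2^64, no UB
}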