Fix undefined behavior in exp10() #69

Closed · wants to merge 1 commit

Conversation

@jart (Contributor) commented May 5, 2024

Left-shifting negative values is undefined behavior.
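
For illustration, a minimal sketch of the pattern in question (the actual one-line diff isn't quoted in this thread, and the names here are hypothetical):

#include <stdint.h>

// exp-style code builds the scale factor 2^n by shifting the integer n
// into the exponent field of a double; the shift is UB whenever n < 0.
uint64_t scale_bits_bad(int64_t n) {
  return n << 52;             // undefined behavior when n is negative
}

uint64_t scale_bits_ok(int64_t n) {
  return (uint64_t)n << 52;   // well-defined: convert to unsigned first
}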

@nsz-arm (Contributor) commented May 21, 2024

The patch looks good, but we need a signed contributor agreement (https://github.com/ARM-software/optimized-routines/blob/master/contributor-agreement.pdf) to merge it. This is required by our contribution policy so that we can update glibc under our FSF copyright assignment.

Alternatively, I can make the fix myself (and attribute the bug report to you), since it is a simple change that is likely below the threshold of copyrightability.

@jart (Contributor, author) commented May 21, 2024

Sure, I'm printing it out now. Where should I mail the completed form?

@nsz-arm (Contributor) commented May 21, 2024

Please send the filled-in PDF to optimized-routines-assignment@arm.com.

@nsz-arm (Contributor) commented May 21, 2024

A scanned image or photo is fine too.

@jart (Contributor, author) commented May 21, 2024

I mailed it!

@nsz-arm (Contributor) commented May 22, 2024

Thanks, committed as 88ffd2e.

But meanwhile I realized it is better to just copy the code from exp, so a follow-up commit fixed it differently.

@nsz-arm closed this May 22, 2024
@jart (Contributor, author) commented May 22, 2024

Thanks! It's an honor to be able to call myself a contributor to this project. Do you want these by any chance? I wrote them for x86 based on one of your functions. If so, where should I put them?

#include <immintrin.h>  // x86 SIMD intrinsics, needed by all three variants below

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
  // n = round(x * log2(e)), computed with the 1.5*2^23 round-to-nearest shifter trick
  const __m512 r = _mm512_set1_ps(0x1.8p23f);
  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
  const __m512 n = _mm512_sub_ps(z, r);
  // b = x - n*ln2, the reduced argument (ln2 split into hi/lo parts for accuracy)
  const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                                    _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
  // k = 2^n, built by shifting the integer bits of z into the float exponent field
  const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
  const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
  // lanes with |n| > 126 need the over-/underflow path below
  const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
  // j ~= exp(b) - 1 via a degree-5 polynomial in the reduced argument
  const __m512 u = _mm512_mul_ps(b, b);
  const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                                                   _mm512_set1_ps(0x1.573e2ep-5f)), u,
                                                   _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                                                   _mm512_set1_ps(0x1.fffdb6p-2f))),
                                   u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
  if (_mm512_kortestz(c, c))
    return _mm512_fmadd_ps(j, k, k);  // fast path: exp(x) = 2^n * exp(b) = k + k*j
  // slow path: apply 2^n as two scale factors s1*s2 so neither one over-/underflows
  const __m512i g = _mm512_and_si512(
      _mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
      _mm512_set1_epi32(0x82000000u));
  const __m512 s1 =
      _mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
  const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
  // lanes with |n| > 192 saturate to s1*s1 (+inf or +0)
  const __mmask16 d =
      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
  return _mm512_mask_blend_ps(
      d, _mm512_mask_blend_ps(
          c, _mm512_fmadd_ps(k, j, k),
          _mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
      _mm512_mul_ps(s1, s1));
}

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
// (same algorithm as the AVX-512 variant above, using bitwise masks in
// place of AVX-512 mask registers)
inline static __m256 ggml_v_expf(__m256 x) {
  const __m256 r = _mm256_set1_ps(0x1.8p23f);
  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
  const __m256 n = _mm256_sub_ps(z, r);
  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
  const __m256 k = _mm256_castsi256_ps(
      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
  const __m256i c = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(126), _CMP_GT_OQ));
  const __m256 u = _mm256_mul_ps(b, b);
  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
    return _mm256_fmadd_ps(j, k, k);
  const __m256i g = _mm256_and_si256(
      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
      _mm256_set1_epi32(0x82000000u));
  const __m256 s1 =
      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
  const __m256i d = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(192), _CMP_GT_OQ));
  return _mm256_or_ps(
      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
      _mm256_andnot_ps(
          _mm256_castsi256_ps(d),
          _mm256_or_ps(
              _mm256_and_ps(_mm256_castsi256_ps(c),
                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}

#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
// (same algorithm again in SSE form; the MADD128/NMADD128 macros above
// fall back to separate mul/add when FMA is unavailable)
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

@zatrazz (Contributor) commented May 22, 2024

Hi @jart, you might want to check whether these implementations are an improvement for the x86_64 libmvec in glibc.

@jart (Contributor, author) commented May 22, 2024

@zatrazz Sure, I'm happy to do that. Could you help me understand how? I spent about an hour trying to figure out how to call their function and haven't had much luck so far.

@zatrazz (Contributor) commented May 22, 2024

@jart libmvec is indeed quite hard to understand. The x86_64 implementations are in the sysdeps/x86_64/fpu and sysdeps/x86_64/fpu/multiarch folders, and their file names start with the string 'svml_' (because they originally came from Intel's Short Vector Math Library intrinsics). The 'svml_s_' files are for single precision while 'svml_d_' are for double, and the library uses iFUNC dispatch by default.

So the AVX2 version will go through the svml_s_expf8_core.c iFUNC resolver, which routes to svml_s_expf8_core_avx2.S.
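
For readers unfamiliar with iFUNC, here is a generic sketch of the mechanism (simplified, with made-up impl_* names; this is not glibc's actual resolver code):

#include <stdio.h>

// Two hypothetical implementations of the same routine.
__attribute__((target("avx2")))
static void impl_avx2(void) { puts("avx2 path"); }
static void impl_sse2(void) { puts("sse2 path"); }

// The resolver runs once, when the dynamic linker binds the symbol,
// and picks an implementation based on the CPU's capabilities.
static void (*resolve_kernel(void))(void) {
  __builtin_cpu_init();
  return __builtin_cpu_supports("avx2") ? impl_avx2 : impl_sse2;
}

// Callers see a single symbol; it is bound to whatever the resolver returned.
void kernel(void) __attribute__((ifunc("resolve_kernel")));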

We have tests for the libmvec calls that are built in the 'math' folder. For instance, the AVX2 one is named math/test-float-libmvec-expf-avx2 and you can run it by issuing:

$ make test t=math/test-float-libmvec-expf-avx2

The libmvec tests start from test-vector-abi.h (which disables the runtime tests if the underlying CPU does not support the vector ABI) and then include test-vector-abi-arg1.h (for the one-argument case). The tests rely on OpenMP simd pragmas so that the compiler generates calls to the vectorized functions (the library is mostly reached through compiler-generated calls from auto-vectorized loops), as in the sketch below.
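
A minimal caller illustrating that path (hypothetical, not glibc's test harness; with glibc's <math.h> on x86_64 and flags along the lines of gcc -O2 -fopenmp-simd -ffast-math, the loop below compiles into calls to libmvec's vector variants, e.g. _ZGVdN8v_expf for AVX2):

#include <math.h>

void vec_expf(float *restrict y, const float *restrict x, int n) {
  // The pragma lets the compiler replace the scalar expf calls with
  // the SIMD variants that glibc's headers declare for libmvec.
  #pragma omp simd
  for (int i = 0; i < n; i++)
    y[i] = expf(x[i]);
}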

The current x86_64 implementations are all in assembly, with only the iFUNC resolvers in C (the Intel way...), but ARM did it in a simpler way for their port of the ARM optimized routines (for instance pow: https://sourceware.org/git/?p=glibc.git;a=commit;h=0fed0b250f728f38bca5f6fba1dcecdccfc6a44e).

@jart (Contributor, author) commented May 22, 2024

I've never once in my entire life (despite many years of trying) managed to successfully compile glibc. So I expropriated the code manually into a freestanding assembly file https://justine.lol/tmp/libmvec_expf_avx2.S and I got it to run.

My code is faster than glibc's libmvec. This isn't a very rigorous benchmark, but here's what I got on a Threadripper 7995WX; it measures how many nanoseconds it takes to compute each float in a long array.

   2.72742 ns 2000x run_expf()
   1.34923 ns 2000x run_llamafile_expf_sse2()
   1.34206 ns 2000x run_libmvec_expf_avx2()
   1.17215 ns 2000x run_llamafile_expf_avx2()
   1.18881 ns 2000x run_llamafile_expf_avx512()

I believe the reason my code is faster is that it can be inlined; that way, all my _mm256_set1_ps() calls get hoisted out of the loop (see the sketch below). My code is also easy to copy and paste, so anyone can transplant it into a header-only library.
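
A hypothetical caller that illustrates the point (not my actual benchmark loop): once the AVX2 ggml_v_expf from above is inlined here, the compiler loads each _mm256_set1_ps constant once, before the loop, instead of on every iteration.

#include <immintrin.h>
#include <stddef.h>

static void exp_array(float *restrict y, const float *restrict x, size_t n) {
  // ggml_v_expf is the inline AVX2 variant defined earlier in this thread.
  for (size_t i = 0; i + 8 <= n; i += 8)
    _mm256_storeu_ps(y + i, ggml_v_expf(_mm256_loadu_ps(x + i)));
}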

@zatrazz (Contributor) commented May 22, 2024

It should be as easy as "./configure --prefix=/usr && make -j$(nproc)" (we have improved the build experience over the years). We even have a script (scripts/build-many-glibcs.py) that bootstraps a complete compiler for all supported ABIs.

And I need to check whether we have a proper benchmark for vectorized expf, but for a more meaningful comparison I would expect the expf version to be built as a proper symbol. Although the inline version works as a header-only library to embed within a project, it is not easy to integrate with how glibc's libmvec was originally designed (as a libcall target for compiler-generated calls from auto-vectorized code).

@zatrazz (Contributor) commented May 22, 2024

We also have some conformance and maximum-expected-error tests, so it would be good if your implementation adhered to all the expected semantics as well (since some error code paths can be really costly).

@jart (Contributor, author) commented May 22, 2024

Here's the benchmark program I wrote, which also verifies the correctness of my implementations, i.e. the max ulp error is 2: https://gist.github.com/jart/5f5bddd513e338da70b0a81b62f9a116. I think my code could be useful for implementing your compiler runtime function. In my benchmark, even if I add __attribute__((__noinline__)) to my AVX2 implementation, it still goes faster than what you're using currently, thanks to the magic of GCC 12.
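
For reference, one common way to count ulp error for floats (a sketch of the idea, not necessarily the gist's exact method): map each float's bit pattern to a monotonically ordered integer, then take the distance between the computed value and the correctly rounded reference.

#include <stdint.h>
#include <string.h>

static uint32_t ulp_distance(float a, float b) {
  uint32_t ua, ub;
  memcpy(&ua, &a, sizeof ua);
  memcpy(&ub, &b, sizeof ub);
  // Map the IEEE-754 bit patterns so that integer order matches float order.
  uint32_t ka = (ua & 0x80000000u) ? ~ua : (ua | 0x80000000u);
  uint32_t kb = (ub & 0x80000000u) ? ~ub : (ub | 0x80000000u);
  return ka > kb ? ka - kb : kb - ka;
}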

@nsz-arm (Contributor) commented May 23, 2024

Thanks. I think it would be useful to collect such code somewhere, but I don't think we want to maintain x86-only code in this repo. (At some point I tried to write SIMD code in this repo in a generic way, but there are enough performance and ISA differences that it didn't work out.)

Sleef is an open-source vector math project; maybe they would take it (but they don't upstream directly to glibc libmvec). Or you can submit the code to glibc (they nowadays accept a sign-off instead of a copyright assignment, but you still have to keep working with the maintainers to get a patch accepted).

@jart (Contributor, author) commented May 23, 2024

Well, thank you for considering it, @nsz-arm. I'll keep an eye out for other opportunities to contribute here, so that hopefully our paths may cross again :-)
