From dc5d7d60ccee73cf3b05a45b43135a87490749ae Mon Sep 17 00:00:00 2001 From: Elliot Gorokhovsky Date: Tue, 28 Jan 2025 13:28:02 -0800 Subject: [PATCH] Make FloatToFloat16 conversion 75x faster using SVE2 instructions (#3626) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3626 X-link: https://github.com/facebookresearch/FBGEMM/pull/703 Rounding was previously (1) not vectorized and (2) [implemented in software](https://fburl.com/code/fa1jzpmo), so speeds were less than 1 byte per cycle. That's really slow. With SVE2 instructions, it's 75x faster (see test plan for measurement). That's due to a combination of vectorization + hardware support for rounding. Differential Revision: D68520774 --- bench/ConvertBenchmark.cc | 2 +- defs.bzl | 14 ++++- include/fbgemm/FbgemmConvert.h | 10 ++++ include/fbgemm/Utils.h | 27 ++++++++++ src/FbgemmFloat16Convert.cc | 4 ++ src/FbgemmFloat16ConvertSVE.cc | 99 ++++++++++++++++++++++++++++++++++ 6 files changed, 153 insertions(+), 3 deletions(-) create mode 100644 src/FbgemmFloat16ConvertSVE.cc diff --git a/bench/ConvertBenchmark.cc b/bench/ConvertBenchmark.cc index 8f4b468c9c..e631788447 100644 --- a/bench/ConvertBenchmark.cc +++ b/bench/ConvertBenchmark.cc @@ -28,7 +28,7 @@ void performance_test() { normal_distribution dist; default_random_engine engine; - cout << setw(4) << "M" << " elements_per_sec_ref" << " elements_per_sec_simd" + cout << setw(4) << "M" << " elements_per_ns_ref" << " elements_per_ns_simd" << endl; array dims{1, 10, 32, 40, 129, 256, 1024, 8000}; diff --git a/defs.bzl b/defs.bzl index 9b8ffc11ec..e69d60bea7 100644 --- a/defs.bzl +++ b/defs.bzl @@ -154,14 +154,24 @@ def get_fbgemm_inline_sve_srcs(msvc = False, buck = False): "src/FbgemmFP16UKernelsSve128.cc", "src/KleidiAIFP16UKernelsNeon.cc", "src/UtilsSve.cc", - ] + ] + select({ + "DEFAULT": [], + "ovr_config//cpu:arm64": [ + "src/FbgemmFloat16ConvertSVE.cc", + ], + }) #FP16 kernels contain inline assembly and inline assembly 
syntax for MSVC is different. asm_srcs = [ "src/FbgemmFP16UKernelsSve128.cc", "src/KleidiAIFP16UKernelsNeon.cc", "src/UtilsSve.cc", - ] + ] + select({ + "DEFAULT": [], + "ovr_config//cpu:arm64": [ + "src/FbgemmFloat16ConvertSVE.cc", + ], + }) if buck: return select({ "DEFAULT": asm_srcs, diff --git a/include/fbgemm/FbgemmConvert.h b/include/fbgemm/FbgemmConvert.h index 298d539a9b..bb279b2b89 100644 --- a/include/fbgemm/FbgemmConvert.h +++ b/include/fbgemm/FbgemmConvert.h @@ -135,6 +135,16 @@ FBGEMM_API void FloatToFloat16_avx512( size_t size, bool do_clip = false); +/** + * @brief SVE2 implementation to convert fp32 numbers to fp16 numbers. + * + */ +FBGEMM_API void FloatToFloat16_sve2( + const float* src, + float16* dst, + size_t size, + bool do_clip = false); + /** * @brief AVX2 implementation to convert fp16 numbers to fp32 numbers. * diff --git a/include/fbgemm/Utils.h b/include/fbgemm/Utils.h index bf0e8e5dd6..6e77881534 100644 --- a/include/fbgemm/Utils.h +++ b/include/fbgemm/Utils.h @@ -18,6 +18,8 @@ #include #include +#include <cfenv> + #ifndef HAVE_SVE #if defined(__aarch64__) && (__GNUC__ >= 8 || __clang_major__ >= 5) && \ __ARM_FEATURE_SVE @@ -466,4 +468,29 @@ void nbit_embedding_sanity_check( } \ } while (0) +class FenvRoundingModeGuard final { + public: + explicit inline FenvRoundingModeGuard(int mode) { + int currentMode = fegetround(); + if (currentMode != mode) { + fesetround(mode); + didUpdate_ = true; + oldMode_ = currentMode; + } + } + inline ~FenvRoundingModeGuard() { + if (didUpdate_) { + fesetround(oldMode_); + } + } + FenvRoundingModeGuard(const FenvRoundingModeGuard&) = delete; + FenvRoundingModeGuard& operator=(const FenvRoundingModeGuard&) = delete; + FenvRoundingModeGuard(FenvRoundingModeGuard&&) = delete; + FenvRoundingModeGuard& operator=(FenvRoundingModeGuard&&) = delete; + + private: + int oldMode_; + bool didUpdate_ = false; +}; + } // namespace fbgemm diff --git a/src/FbgemmFloat16Convert.cc b/src/FbgemmFloat16Convert.cc index
d2d3756038..b4694bb9db 100644 --- a/src/FbgemmFloat16Convert.cc +++ b/src/FbgemmFloat16Convert.cc @@ -43,6 +43,10 @@ void FloatToFloat16_simd( FloatToFloat16_avx512(src, dst, size, do_clip); } else if (fbgemmHasAvx2Support()) { FloatToFloat16_avx2(src, dst, size, do_clip); +#ifdef __aarch64__ + } else if (fbgemmHasArmSve2Support()) { + FloatToFloat16_sve2(src, dst, size, do_clip); +#endif } else { FloatToFloat16_ref(src, dst, size, do_clip); return; diff --git a/src/FbgemmFloat16ConvertSVE.cc b/src/FbgemmFloat16ConvertSVE.cc new file mode 100644 index 0000000000..6a48ed8390 --- /dev/null +++ b/src/FbgemmFloat16ConvertSVE.cc @@ -0,0 +1,99 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#if defined(__ARM_FEATURE_SVE2) +#include <arm_sve.h> +#endif +#include <stdexcept> + +#define FBGEMM_EXPORTS +#include "fbgemm/FbgemmConvert.h" +#include "fbgemm/Utils.h" + +namespace fbgemm { + +#if defined(__ARM_FEATURE_SVE2) + +namespace { + +// Load two vectors, convert them from fp32 to fp16, store one vector. +void FloatToFloat16KernelSve2_TwoVecs(const float* src, float16* dst) { + const svbool_t pt = svptrue_b16(); + svfloat32x2_t srcVecs = svld2_f32(pt, src); + svfloat16_t even = svcvt_f16_f32_x(pt, svget2(srcVecs, 0)); + svfloat16_t result = svcvtnt_f16_f32_x(even, pt, svget2(srcVecs, 1)); + svst1_f16(pt, reinterpret_cast<float16_t*>(dst), result); +} + +// Load and clip two vectors, convert them from fp32 to fp16, store one +// vector.
+void FloatToFloat16KernelSve2_TwoVecs_WithClip(const float* src, float16* dst) { + const svbool_t pt = svptrue_b16(); + constexpr float FP16_MAX = 65504.f; + + // Load two vectors + const svfloat32x2_t srcVecs = svld2_f32(pt, src); + svfloat32_t src0 = svget2(srcVecs, 0); + svfloat32_t src1 = svget2(srcVecs, 1); + + // Do the clipping + src0 = svmin_n_f32_x(pt, src0, FP16_MAX); + src0 = svmax_n_f32_x(pt, src0, -FP16_MAX); + src1 = svmin_n_f32_x(pt, src1, FP16_MAX); + src1 = svmax_n_f32_x(pt, src1, -FP16_MAX); + + // Convert fp32 -> fp16 + const svfloat16_t even = svcvt_f16_f32_x(pt, src0); + const svfloat16_t result = svcvtnt_f16_f32_x(even, pt, src1); + + // Store one vector + svst1_f16(pt, reinterpret_cast<float16_t*>(dst), result); +} + +} // namespace + +void FloatToFloat16_sve2( + const float* src, + float16* dst, + size_t size, + bool do_clip) { +#pragma STDC FENV_ROUND FE_TONEAREST + const size_t chunkSize = svcntw() * 2; + + // Note: we don't use predicates here, because then we can't use svld2. This + // is not optimal for small buffers, but we already have high overhead on + // small buffers because we have to set fp rounding mode, so I don't care. + if (do_clip) { + size_t i; + for (i = 0; i + chunkSize < size; i += chunkSize) { + FloatToFloat16KernelSve2_TwoVecs_WithClip(src + i, dst + i); + } + FloatToFloat16_ref(src + i, dst + i, size - i, do_clip); + } else { + size_t i; + for (i = 0; i + chunkSize < size; i += chunkSize) { + FloatToFloat16KernelSve2_TwoVecs(src + i, dst + i); + } + FloatToFloat16_ref(src + i, dst + i, size - i, do_clip); + } +} + +#else + +void FloatToFloat16_sve2( + const float* src, + float16* dst, + size_t size, + bool do_clip) { + throw std::runtime_error{ + "CPU supports SVE2 instructions, but you didn't enable SVE2 in your build command. Fix your build!"}; +} + +#endif // defined(__ARM_FEATURE_SVE2) + +} // namespace fbgemm