fix: make avx512 fp16 a runtime check (#1884)

albertlockett · web-flow · commit 628f7a3c5e5f · 2024-01-30T14:19:15.000-05:00
Makes [avx512 fp16](https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide) support a runtime check. This will allow binaries compiled w/ the avx512fp16 feature to run on hardware that doesn't support this feature (e.g. x86 CPUs before Sapphire Rapids). The check does not add a performance penalty: ``` albertlockett@albert-ubuntu-saphire:~/lance/rust/lance-linalg$ TARGET_TIME=55 cargo bench \ --bench dot \ -F avx512fp16 Compiling lance-linalg v0.9.9 (/home/albertlockett/lance/rust/lance-linalg) Finished bench [optimized + debuginfo] target(s) in 55.77s Running benches/dot.rs (/home/albertlockett/lance/rust/target/release/deps/dot-f42dee3ad61e0342) Gnuplot not found, using plotters backend Dot(half::binary16::f16, arrow_artiy) time: [2.5228 s 2.5230 s 2.5233 s] change: [-0.0915% -0.0641% -0.0381%] (p = 0.00 < 0.10) Change within noise threshold. Dot(half::binary16::f16, auto-vectorization) time: [167.90 ms 168.05 ms 168.34 ms] change: [-0.3945% -0.1097% +0.1731%] (p = 0.47 > 0.10) No change in performance detected. Dot(f16, SIMD) time: [167.03 ms 167.22 ms 167.50 ms] change: [-1.4038% -0.9215% -0.4951%] (p = 0.00 < 0.10) Change within noise threshold. ```
diff --git a/rust/lance-core/src/utils.rs b/rust/lance-core/src/utils.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 pub mod address;
+pub mod cpu;
 pub mod deletion;
 pub mod mask;
 pub mod testing;
diff --git a/rust/lance-core/src/utils/cpu.rs b/rust/lance-core/src/utils/cpu.rs
@@ -0,0 +1,40 @@
+// Copyright 2024 Lance Developers.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Runtime CPU feature detection helpers for x86_64 targets.
+#[cfg(target_arch = "x86_64")]
+pub mod x86 {
+    use core::arch::x86_64::__cpuid;
+
+    use lazy_static::lazy_static;
+
+    /// Returns `true` when bit `position` (counted from the least significant
+    /// bit, starting at 0) of `x` is set.
+    #[inline]
+    fn check_flag(x: usize, position: u32) -> bool {
+        x & (1 << position) != 0
+    }
+
+    lazy_static! {
+        /// Whether the AVX512-FP16 kernels may be used on the current CPU.
+        ///
+        /// Evaluated lazily on first dereference. It is `true` only when
+        /// `avx512f` is detected at runtime AND bit 23 of EDX from CPUID
+        /// leaf 7 is set.
+        pub static ref AVX512_F16_SUPPORTED: bool = {
+            // `is_x86_feature_detected!` does many OS checks/etc. to determine
+            // whether AVX512 may be used at all; bail out early if it may not.
+            if !is_x86_feature_detected!("avx512f") {
+                return false;
+            }
+
+            // EAX=7, ECX=0: Extended Features (includes AVX512).
+            // Bit 23 of EDX is the flag tested below as the AVX512-FP16 indicator.
+            // More info on calling CPUID can be found here (section 1.4):
+            // https://www.intel.com/content/dam/develop/external/us/en/documents/architecture-instruction-set-extensions-programming-reference.pdf
+            // SAFETY (NOTE(review), confirm): this module is gated on
+            // `target_arch = "x86_64"`, where the CPUID instruction is expected
+            // to be available, and `avx512f` was detected just above.
+            let ext_cpuid_result = unsafe { __cpuid(7) };
+            check_flag(ext_cpuid_result.edx as usize, 23)
+        };
+    }
+}
diff --git a/rust/lance-linalg/Cargo.toml b/rust/lance-linalg/Cargo.toml
@@ -16,6 +16,7 @@ arrow-schema = { workspace = true }
 futures = { workspace = true }
 half = { workspace = true }
 lance-arrow = { workspace = true }
+lance-core = { workspace = true }
 log = { workspace = true }
 num_cpus = { workspace = true }
 num-traits = { workspace = true }
diff --git a/rust/lance-linalg/benches/dot.rs b/rust/lance-linalg/benches/dot.rs
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 use std::iter::{repeat_with, Sum};
+use std::time::Duration;
 
 use arrow_array::{
     types::{Float16Type, Float32Type, Float64Type},
@@ -131,18 +132,26 @@ fn bench_distance(c: &mut Criterion) {
     run_bench::<Float64Type>(c);
 }
 
+/// Measurement time handed to Criterion's `measurement_time`, in whole seconds.
+///
+/// The value comes from the `TARGET_TIME` environment variable, read through
+/// `option_env!` (i.e. when the bench binary is compiled, not when it runs),
+/// and falls back to `"5"` when the variable is unset.
+///
+/// # Panics
+///
+/// Panics if the value cannot be parsed as a `u64`.
+fn bench_time() -> Duration {
+    let secs: u64 = option_env!("TARGET_TIME").unwrap_or("5").parse().unwrap();
+    Duration::from_secs(secs)
+}
+
 #[cfg(target_os = "linux")]
 criterion_group!(
     name=benches;
-    config = Criterion::default().significance_level(0.1).sample_size(10)
+    config = Criterion::default()
+        .significance_level(0.1)
+        .sample_size(10)
+        .measurement_time(bench_time())
         .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
     targets = bench_distance);
 
 // Non-linux version does not support pprof.
 #[cfg(not(target_os = "linux"))]
 criterion_group!(
     name=benches;
-    config = Criterion::default().significance_level(0.1).sample_size(10);
+    config = Criterion::default().significance_level(0.1).sample_size(10).measurement_time(bench_time());
     targets = bench_distance);
 
 criterion_main!(benches);
diff --git a/rust/lance-linalg/src/distance/dot.rs b/rust/lance-linalg/src/distance/dot.rs
@@ -28,6 +28,9 @@ use lance_arrow::{ArrowFloatType, FloatArray, FloatToArrayType};
 use num_traits::real::Real;
 use num_traits::AsPrimitive;
 
+#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
+use lance_core::utils::cpu::x86::AVX512_F16_SUPPORTED;
+
 use crate::simd::{
     f32::{f32x16, f32x8},
     SIMD,
@@ -112,13 +115,18 @@ mod kernel {
 impl Dot for Float16Type {
     #[inline]
     fn dot(x: &[f16], y: &[f16]) -> f32 {
-        #[cfg(any(
-            all(target_os = "macos", target_feature = "neon"),
-            all(target_os = "linux", feature = "avx512fp16")
-        ))]
+        #[cfg(all(target_os = "macos", target_feature = "neon"))]
         unsafe {
             kernel::dot_f16(x.as_ptr(), y.as_ptr(), x.len() as u32)
         }
+
+        #[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
+        if *AVX512_F16_SUPPORTED {
+            unsafe { kernel::dot_f16(x.as_ptr(), y.as_ptr(), x.len() as u32) }
+        } else {
+            dot_scalar::<f16, 16>(x, y)
+        }
+
         #[cfg(not(any(
             all(target_os = "macos", target_feature = "neon"),
             all(target_os = "linux", feature = "avx512fp16")
diff --git a/rust/lance-linalg/src/distance/l2.rs b/rust/lance-linalg/src/distance/l2.rs
@@ -29,6 +29,9 @@ use half::{bf16, f16};
 use lance_arrow::{bfloat16::BFloat16Type, ArrowFloatType, FloatArray, FloatToArrayType};
 use num_traits::{AsPrimitive, Float};
 
+#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
+use lance_core::utils::cpu::x86::AVX512_F16_SUPPORTED;
+
 use crate::simd::{
     f32::{f32x16, f32x8},
     SIMD,
@@ -119,13 +122,16 @@ mod kernel {
 impl L2 for Float16Type {
     #[inline]
     fn l2(x: &[f16], y: &[f16]) -> f32 {
-        #[cfg(any(
-            all(target_os = "macos", target_feature = "neon"),
-            all(target_os = "linux", feature = "avx512fp16")
-        ))]
+        #[cfg(all(target_os = "macos", target_feature = "neon"))]
         unsafe {
             kernel::l2_f16(x.as_ptr(), y.as_ptr(), x.len() as u32)
         }
+        #[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
+        if *AVX512_F16_SUPPORTED {
+            unsafe { kernel::l2_f16(x.as_ptr(), y.as_ptr(), x.len() as u32) }
+        } else {
+            l2_scalar::<f16, 16>(x, y)
+        }
         #[cfg(not(any(
             all(target_os = "macos", target_feature = "neon"),
             all(target_os = "linux", feature = "avx512fp16")
diff --git a/rust/lance-linalg/src/distance/norm_l2.rs b/rust/lance-linalg/src/distance/norm_l2.rs
@@ -17,6 +17,9 @@ use std::iter::Sum;
 use half::{bf16, f16};
 use num_traits::{AsPrimitive, Float};
 
+#[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
+use lance_core::utils::cpu::x86::AVX512_F16_SUPPORTED;
+
 use crate::simd::{
     f32::{f32x16, f32x8},
     SIMD,
@@ -45,47 +48,56 @@ mod kernel {
 impl Normalize<f16> for &[f16] {
     // #[inline]
     fn norm_l2(&self) -> f32 {
-        #[cfg(any(
-            all(target_os = "macos", target_feature = "neon"),
-            feature = "avx512fp16"
-        ))]
+        #[cfg(all(target_os = "macos", target_feature = "neon"))]
         unsafe {
             kernel::norm_l2_f16(self.as_ptr(), self.len() as u32)
         }
+
+        #[cfg(all(target_os = "linux", feature = "avx512fp16", target_arch = "x86_64"))]
+        if *AVX512_F16_SUPPORTED {
+            unsafe { kernel::norm_l2_f16(self.as_ptr(), self.len() as u32) }
+        } else {
+            norm_l2_f16_impl(self)
+        }
+
         #[cfg(not(any(
             all(target_os = "macos", target_feature = "neon"),
             feature = "avx512fp16"
         )))]
-        {
-            // Please run `cargo bench --bench norm_l2" on Apple Silicon when
-            // change the following code.
-            const LANES: usize = 16;
-            let chunks = self.chunks_exact(LANES);
-            let sum = if chunks.remainder().is_empty() {
-                0.0
-            } else {
-                chunks
-                    .remainder()
-                    .iter()
-                    .map(|v| v.to_f32().powi(2))
-                    .sum::<f32>()
-            };
-
-            let mut sums: [f32; LANES] = [0_f32; LANES];
-            for chk in chunks {
-                // Convert to f32
-                let mut f32_vals: [f32; LANES] = [0_f32; LANES];
-                for i in 0..LANES {
-                    f32_vals[i] = chk[i].to_f32();
-                }
-                // Vectorized multiply
-                for i in 0..LANES {
-                    sums[i] += f32_vals[i].powi(2);
-                }
-            }
-            (sums.iter().copied().sum::<f32>() + sum).sqrt()
+        norm_l2_f16_impl(self)
+    }
+}
+
+/// Portable L2 norm of an `f16` slice: the square root of the sum of squared
+/// elements, with every element widened to `f32` before it is squared.
+///
+/// Not compiled when the target is macOS with NEON enabled (see the `cfg`
+/// attribute below).
+#[inline]
+#[cfg(not(all(target_os = "macos", target_feature = "neon")))]
+fn norm_l2_f16_impl(arr: &[f16]) -> f32 {
+    // Please run `cargo bench --bench norm_l2` on Apple Silicon when
+    // changing the following code.
+    // NOTE(review): this function is cfg'd out for macOS + NEON builds, so the
+    // Apple Silicon advice above may be stale -- confirm.
+    const LANES: usize = 16;
+    let chunks = arr.chunks_exact(LANES);
+    // Sum of squares of the tail that does not fill a whole LANES-wide chunk.
+    let sum = if chunks.remainder().is_empty() {
+        0.0
+    } else {
+        chunks
+            .remainder()
+            .iter()
+            .map(|v| v.to_f32().powi(2))
+            .sum::<f32>()
+    };
+
+    // One f32 accumulator per lane.
+    let mut sums: [f32; LANES] = [0_f32; LANES];
+    for chk in chunks {
+        // Convert to f32
+        let mut f32_vals: [f32; LANES] = [0_f32; LANES];
+        for i in 0..LANES {
+            f32_vals[i] = chk[i].to_f32();
+        }
+        // Square each lane and add it to that lane's running sum.
+        for i in 0..LANES {
+            sums[i] += f32_vals[i].powi(2);
         }
     }
+    // Combine the per-lane sums with the tail sum, then take the square root.
+    (sums.iter().copied().sum::<f32>() + sum).sqrt()
 }
 
 impl Normalize<bf16> for &[bf16] {