Skip to content

Commit

Permalink
feature flag improvements
Browse files Browse the repository at this point in the history
Signed-off-by: Heinz N. Gies <heinz@licenser.net>
  • Loading branch information
Licenser committed Oct 20, 2023
1 parent 52b94bd commit 008db06
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 45 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ To be able to take advantage of `simd-json` your system needs to be SIMD capable
`simd-json` supports AVX2, SSE4.2, NEON and simd128 (wasm) natively. It also includes an unoptimized fallback implementation using native Rust for other platforms; however, this is a last-resort measure and nothing we'd recommend relying on.


### Performance characteristics

- Compiling for the native CPU results in the best performance.
- CPU detection for AVX2 and SSE4.2 is the second fastest (on x86_* only).
- Portable std::simd is the next fastest implementation when compiled with a native CPU target.
- std::simd or the Rust native implementation is the least performant.

### allocator

Expand All @@ -36,6 +42,8 @@ For best performance we highly suggest using [mimalloc](https://crates.io/crates

This feature allows selecting the optimal algorithm based on the available features at runtime; it has no effect on platforms other than x86 or x86_64. When neither `AVX2` nor `SSE4.2` is supported it will fall back to a native Rust implementation.

Note that an application compiled with `runtime-detection` will not run as fast as an application compiled for a specific CPU. The reason is that Rust can't optimize as far for the instruction set when it uses the generic instruction set; the non-SIMD parts of the code won't be optimized for the given instruction set either.

### `portable`

**Currently disabled**
Expand Down
24 changes: 18 additions & 6 deletions examples/perf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,24 @@ mod int {
use perfcnt::linux::{HardwareEventType, PerfCounterBuilderLinux};
use perfcnt::{AbstractPerfCounter, PerfCounter};
use serde::{Deserialize, Serialize};
use simd_json::{Deserializer, Implementation};
use std::io::BufReader;

/// Statistics gathered for one benchmarked input, (de)serializable via serde.
#[derive(Default, Serialize, Deserialize)]
struct Stats {
/// Name of the implementation in use; `Stats::new` fills it with the
/// string form of an `Implementation`.
algo: String,
/// Counter values of the best run (presumably the fastest one — confirm
/// against the code that updates it); `best.cycles` is used for the best
/// cycles-per-byte figure when printing.
best: Stat,
/// Summed counter values; divided by `iters` when printing.
total: Stat,
/// Divisor applied to `total` when printing (presumably the number of
/// measured iterations — confirm against the code that updates it).
iters: u64,
}
impl Stats {
fn new(algo: Implementation) -> Self {
Stats {
algo: algo.to_string(),
..Default::default()
}
}
}

#[derive(Default, Serialize, Deserialize)]
struct Stat {
Expand Down Expand Up @@ -96,15 +106,16 @@ mod int {
let branch_instructions = self.total.branch_instructions / self.iters;

println!(
"{:20} {:10} {:10} {:10} {:10} {:10} {:10.3} {:10.3}",
"{:20} {:10} {:10} {:10} {:10} {:10} {:10.3} {:10.3} {:21}",
name,
cycles,
instructions,
branch_instructions,
cache_misses,
cache_references,
((self.best.cycles as f64) / bytes as f64),
((cycles as f64) / bytes as f64)
((cycles as f64) / bytes as f64),
self.algo
);
}
pub fn print_diff(&self, baseline: &Stats, name: &str, bytes: usize) {
Expand Down Expand Up @@ -135,7 +146,7 @@ mod int {
}

println!(
"{:20} {:>10} {:>10} {:>10} {:>10} {:>10} {:10} {:10}",
"{:20} {:>10} {:>10} {:>10} {:>10} {:>10} {:10} {:10} {:21}",
format!("{}(+/-)", name),
d((1.0 - cycles_b as f64 / cycles as f64) * 100.0),
d((1.0 - instructions_b as f64 / instructions as f64) * 100.0),
Expand All @@ -144,6 +155,7 @@ mod int {
d((1.0 - cache_references_b as f64 / cache_references as f64) * 100.0),
d((1.0 - best_cycles_per_byte_b as f64 / best_cycles_per_byte as f64) * 100.0),
d((1.0 - cycles_per_byte_b as f64 / cycles_per_byte as f64) * 100.0),
baseline.algo
);
}
}
Expand All @@ -166,7 +178,7 @@ mod int {
for mut bytes in &mut data_entries[..WARMUP as usize] {
simd_json::to_borrowed_value(&mut bytes).unwrap();
}
let mut stats = Stats::default();
let mut stats = Stats::new(Deserializer::algorithm());
for mut bytes in &mut data_entries[WARMUP as usize..] {
// Set up counters
let pc = stats.start();
Expand Down Expand Up @@ -219,8 +231,8 @@ fn main() {
let matches = opts.parse(&args[1..]).unwrap();

println!(
"{:^20} {:^10} {:^21} {:^21} {:^21}",
" ", "", "Instructions", "Cache.", "Cycle/byte"
"{:^20} {:^10} {:^21} {:^21} {:^21} {:21}",
" ", "", "Instructions", "Cache.", "Cycle/byte", "Algorithm"
);
println!(
"{:^20} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10} {:^10}",
Expand Down
110 changes: 71 additions & 39 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,7 @@ impl<'de> Deserializer<'de> {
}
#[cfg(not(any(
feature = "runtime-detection",
feature = "portable",
target_feature = "avx2",
target_feature = "sse4.2",
target_feature = "simd128",
Expand All @@ -465,14 +466,20 @@ impl<'de> Deserializer<'de> {
/// returns the algorithm / architecture used by the deserializer
#[must_use]
pub fn algorithm() -> Implementation {
#[cfg(feature = "portable")]
let r = Implementation::StdSimd;
#[cfg(not(feature = "portable"))]
let r = Implementation::Native;
r
Implementation::Native
}
#[cfg(all(feature = "portable", not(feature = "runtime-detection")))]
/// Returns the algorithm / architecture used by the deserializer.
///
/// This variant is compiled when the `portable` feature is enabled and the
/// `runtime-detection` feature is not; it always reports
/// `Implementation::StdSimd`.
#[must_use]
pub fn algorithm() -> Implementation {
Implementation::StdSimd
}

#[cfg(all(target_feature = "avx2", not(feature = "runtime-detection")))]
#[cfg(all(
target_feature = "avx2",
not(feature = "portable"),
not(feature = "runtime-detection"),
))]
/// returns the algorithm / architecture used by the deserializer
#[must_use]
pub fn algorithm() -> Implementation {
Expand All @@ -481,22 +488,24 @@ impl<'de> Deserializer<'de> {

#[cfg(all(
target_feature = "sse4.2",
not(target_feature = "avx2"),
not(feature = "runtime-detection"),
not(target_feature = "avx2")
not(feature = "portable"),
))]
/// returns the algorithm / architecture used by the deserializer
#[must_use]
pub fn algorithm() -> Implementation {
Implementation::SSE42
}

#[cfg(target_arch = "aarch64")]
#[cfg(all(target_arch = "aarch64", not(feature = "portable")))]
/// returns the algorithm / architecture used by the deserializer
#[must_use]
pub fn algorithm() -> Implementation {
Implementation::NEON
}
#[cfg(target_feature = "simd128")]

#[cfg(all(target_feature = "simd128", not(feature = "portable")))]
/// returns the algorithm / architecture used by the deserializer
#[must_use]
pub fn algorithm() -> Implementation {
Expand Down Expand Up @@ -560,6 +569,7 @@ impl<'de> Deserializer<'de> {
#[inline]
#[cfg(not(any(
feature = "runtime-detection",
feature = "portable",
target_feature = "avx2",
target_feature = "sse4.2",
target_feature = "simd128",
Expand All @@ -575,16 +585,29 @@ impl<'de> Deserializer<'de> {
'de: 'invoke,
{
let input: SillyWrapper<'de> = SillyWrapper::from(input);

#[cfg(feature = "portable")]
let r = impls::portable::parse_str(input, data, buffer, idx);
#[cfg(not(feature = "portable"))]
let r = impls::native::parse_str(input, data, buffer, idx);
r
impls::native::parse_str(input, data, buffer, idx)
}
/// Parses a string using the portable implementation.
///
/// This variant is compiled when the `portable` feature is enabled and the
/// `runtime-detection` feature is not. It wraps the raw `input` pointer in a
/// `SillyWrapper<'de>` and forwards all arguments to
/// `impls::portable::parse_str`, returning its result unchanged.
///
/// # Safety
///
/// NOTE(review): the exact requirements on `input`, `data`, `buffer` and
/// `idx` are those of `impls::portable::parse_str`, which is not visible
/// here — confirm against that function.
#[inline]
#[cfg(all(feature = "portable", not(feature = "runtime-detection")))]
pub(crate) unsafe fn parse_str_<'invoke>(
input: *mut u8,
data: &'invoke [u8],
buffer: &'invoke mut [u8],
idx: usize,
) -> Result<&'de str>
where
'de: 'invoke,
{
// The explicit `'de` annotation ties the wrapped pointer to the deserializer lifetime.
let input: SillyWrapper<'de> = SillyWrapper::from(input);
impls::portable::parse_str(input, data, buffer, idx)
}

#[inline]
#[cfg(all(target_feature = "avx2", not(feature = "runtime-detection")))]
#[cfg(all(
target_feature = "avx2",
not(feature = "portable"),
not(feature = "runtime-detection"),
))]
pub(crate) unsafe fn parse_str_<'invoke>(
input: *mut u8,
data: &'invoke [u8],
Expand All @@ -598,8 +621,9 @@ impl<'de> Deserializer<'de> {
#[inline]
#[cfg(all(
target_feature = "sse4.2",
not(target_feature = "avx2"),
not(feature = "runtime-detection"),
not(target_feature = "avx2")
not(feature = "portable"),
))]
pub(crate) unsafe fn parse_str_<'invoke>(
input: *mut u8,
Expand All @@ -612,7 +636,7 @@ impl<'de> Deserializer<'de> {
}

#[inline]
#[cfg(target_arch = "aarch64")]
#[cfg(all(target_arch = "aarch64", not(feature = "portable")))]
pub(crate) unsafe fn parse_str_<'invoke>(
input: *mut u8,
data: &'invoke [u8],
Expand All @@ -622,7 +646,7 @@ impl<'de> Deserializer<'de> {
impls::neon::parse_str(input, data, buffer, idx)
}
#[inline]
#[cfg(target_feature = "simd128")]
#[cfg(all(target_feature = "simd128", not(feature = "portable")))]
pub(crate) unsafe fn parse_str_<'invoke>(
input: *mut u8,
data: &'invoke [u8],
Expand Down Expand Up @@ -678,68 +702,76 @@ impl<'de> Deserializer<'de> {
mem::transmute::<FnRaw, FindStructuralBitsFn>(fun)(input, structural_indexes)
}

#[inline]
#[cfg(not(any(
feature = "runtime-detection",
feature = "portable",
target_feature = "avx2",
target_feature = "sse4.2",
target_feature = "simd128",
target_arch = "aarch64",
)))]
#[inline]
pub(crate) unsafe fn find_structural_bits(
input: &[u8],
structural_indexes: &mut Vec<u32>,
) -> std::result::Result<(), ErrorType> {
#[cfg(not(feature = "portable"))]
let r = {
// This is a nasty hack, we don't have a chunked implementation for native rust
// so we validate UTF8 ahead of time
match core::str::from_utf8(input) {
Ok(_) => (),
Err(_) => return Err(ErrorType::InvalidUtf8),
};
#[cfg(not(feature = "portable"))]
Self::_find_structural_bits::<impls::native::SimdInput>(input, structural_indexes)
// This is a nasty hack, we don't have a chunked implementation for native rust
// so we validate UTF8 ahead of time
match core::str::from_utf8(input) {
Ok(_) => (),
Err(_) => return Err(ErrorType::InvalidUtf8),
};
#[cfg(feature = "portable")]
let r =
Self::_find_structural_bits::<impls::portable::SimdInput>(input, structural_indexes);
r
#[cfg(not(feature = "portable"))]
Self::_find_structural_bits::<impls::native::SimdInput>(input, structural_indexes)
}

#[cfg(all(feature = "portable", not(feature = "runtime-detection")))]
#[inline]
#[cfg(all(target_feature = "avx2", not(feature = "runtime-detection")))]
pub(crate) unsafe fn find_structural_bits(
input: &[u8],
structural_indexes: &mut Vec<u32>,
) -> std::result::Result<(), ErrorType> {
Self::_find_structural_bits::<impls::avx2::SimdInput>(input, structural_indexes)
Self::_find_structural_bits::<impls::portable::SimdInput>(input, structural_indexes)
}

/// Finds the structural bits of `input` using the AVX2 implementation.
///
/// This variant is compiled when the `avx2` target feature is enabled and
/// neither the `portable` nor the `runtime-detection` feature is. It forwards
/// to `_find_structural_bits` instantiated with `impls::avx2::SimdInput` and
/// returns that call's result unchanged.
///
/// # Safety
///
/// NOTE(review): the safety requirements are those of
/// `_find_structural_bits`, which is not visible here — confirm against that
/// function.
#[cfg(all(
target_feature = "avx2",
not(feature = "portable"),
not(feature = "runtime-detection"),
))]
#[inline]
pub(crate) unsafe fn find_structural_bits(
input: &[u8],
structural_indexes: &mut Vec<u32>,
) -> std::result::Result<(), ErrorType> {
Self::_find_structural_bits::<impls::avx2::SimdInput>(input, structural_indexes)
}

#[cfg(all(
target_feature = "sse4.2",
not(target_feature = "avx2"),
not(feature = "runtime-detection"),
not(target_feature = "avx2")
not(feature = "portable"),
))]
#[inline]
pub(crate) unsafe fn find_structural_bits(
input: &[u8],
structural_indexes: &mut Vec<u32>,
) -> std::result::Result<(), ErrorType> {
Self::_find_structural_bits::<impls::sse42::SimdInput>(input, structural_indexes)
}

#[cfg(all(target_arch = "aarch64", not(feature = "portable")))]
#[inline]
#[cfg(target_arch = "aarch64")]
pub(crate) unsafe fn find_structural_bits(
input: &[u8],
structural_indexes: &mut Vec<u32>,
) -> std::result::Result<(), ErrorType> {
Self::_find_structural_bits::<impls::neon::SimdInput>(input, structural_indexes)
}

#[cfg(all(target_feature = "simd128", not(feature = "portable")))]
#[inline]
#[cfg(target_feature = "simd128")]
pub(crate) unsafe fn find_structural_bits(
input: &[u8],
structural_indexes: &mut Vec<u32>,
Expand Down

0 comments on commit 008db06

Please sign in to comment.