From 6692ca3bb99bd21b894f12e3c9a54141e1c498b9 Mon Sep 17 00:00:00 2001
From: Chris Jefferson
Date: Tue, 26 Feb 2019 11:48:19 +0000
Subject: [PATCH] Move COUNT_TRUES_BLOCKS and COUNT_TRUES_BLOCK back to a public header

---
 src/bits_intern.h | 102 ----------------------------------------------
 src/blister.h     | 102 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 102 deletions(-)

diff --git a/src/bits_intern.h b/src/bits_intern.h
index f3bde3df70..1c60e90641 100644
--- a/src/bits_intern.h
+++ b/src/bits_intern.h
@@ -13,108 +13,6 @@
 
 #include "system.h"
 
-/****************************************************************************
-**
-*F  COUNT_TRUES_BLOCK( <block> ) . . . . . . . . . . . count number of trues
-**
-**  'COUNT_TRUES_BLOCK( <block> )' returns the number of 1 bits in the
-**  UInt <block>. Two implementations are included below. One uses the
-**  gcc builtin __builtin_popcountl, which usually generates the popcntl
-**  or popcntq instruction on sufficiently recent CPUs. The other uses
-**  the algorithm described in the original comment below:
-**
-**  The sequence to compute the number of bits in a block is quite clever.
-**  The idea is that after the <i>-th instruction each subblock of $2^i$ bits
-**  holds the number of bits of this subblock in the original block <block>.
-**  This is illustrated in the example below for a block with 8 bits:
-**
-**      // a b c d e f g h
-**      m = (m & 0x55) + ((m >> 1) & 0x55);
-**      // . b . d . f . h + . a . c . e . g = a+b c+d e+f g+h
-**      m = (m & 0x33) + ((m >> 2) & 0x33);
-**      // . . c+d . . g+h + . . a+b . . e+f = a+b+c+d e+f+g+h
-**      m = (m & 0x0f) + ((m >> 4) & 0x0f);
-**      // . . . . e+f+g+h + . . . . a+b+c+d = a+b+c+d+e+f+g+h
-**
-**  In the actual code some unnecessary masks have been removed, improving
-**  performance quite a bit, because masks are 32 bit immediate values for
-**  which most RISC processors need two instructions to load them. Talking
-**  about performance: the code is close to optimal; it should compile to
-**  only about 22 MIPS or SPARC instructions. Dividing the block into 4
-**  bytes and looking up the number of bits of a byte in a table may be 10%
-**  faster, but only if the table lives in the data cache.
-**
-**  At this time (2017) the optimum choice of implementation for this
-**  function as used seems to be to use the gcc builtin on all systems --
-**  but see the comments below in the documentation of
-**  'COUNT_TRUES_BLOCKS'.
-**
-*/
-static inline UInt COUNT_TRUES_BLOCK(UInt block)
-{
-#if USE_POPCNT && defined(HAVE___BUILTIN_POPCOUNTL)
-    return __builtin_popcountl(block);
-#else
-#ifdef SYS_IS_64_BIT
-    block =
-        (block & 0x5555555555555555L) + ((block >> 1) & 0x5555555555555555L);
-    block =
-        (block & 0x3333333333333333L) + ((block >> 2) & 0x3333333333333333L);
-    block = (block + (block >> 4)) & 0x0f0f0f0f0f0f0f0fL;
-    block = (block + (block >> 8));
-    block = (block + (block >> 16));
-    block = (block + (block >> 32)) & 0x00000000000000ffL;
-#else
-    block = (block & 0x55555555) + ((block >> 1) & 0x55555555);
-    block = (block & 0x33333333) + ((block >> 2) & 0x33333333);
-    block = (block + (block >> 4)) & 0x0f0f0f0f;
-    block = (block + (block >> 8));
-    block = (block + (block >> 16)) & 0x000000ff;
-#endif
-    return block;
-#endif
-}
-
-/****************************************************************************
-**
-*F  COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )
-**
-**  'COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )' returns the total number of 1
-**  bits in the array of UInt values starting at <ptr> and including a total
-**  of <nblocks> UInts. The only reason this function is really needed is
-**  that, owing to hardware bugs and compiler peculiarities current in 2017
-**  (see http://danluu.com/assembly-intrinsics/ or
-**  https://stackoverflow.com/questions/25078285?), manually unrolling this
-**  loop makes the code substantially faster on almost all CPUs.
-**
-**  This interacts strangely with the choice of algorithm for
-**  COUNT_TRUES_BLOCK above. Without the loop unrolling, not using the gcc
-**  builtin is sometimes faster, apparently because it allows the compiler
-**  to unroll the loop and then generate SSE or AVX code to process multiple
-**  words at once. With the loop unrolling the builtin is always faster, and
-**  will itself generate AVX code when compiling for suitable processors.
-**
-**  TODO: monitor this situation periodically.
-*/
-static inline UInt COUNT_TRUES_BLOCKS(const UInt * ptr, UInt nblocks)
-{
-    UInt n = 0;
-    while (nblocks >= 4) {
-        UInt n1 = COUNT_TRUES_BLOCK(*ptr++);
-        UInt n2 = COUNT_TRUES_BLOCK(*ptr++);
-        UInt n3 = COUNT_TRUES_BLOCK(*ptr++);
-        UInt n4 = COUNT_TRUES_BLOCK(*ptr++);
-        n += n1 + n2 + n3 + n4;
-        nblocks -= 4;
-    }
-    while (nblocks) {
-        n += COUNT_TRUES_BLOCK(*ptr++);
-        nblocks--;
-    }
-    // return the number of bits
-    return n;
-}
-
 /****************************************************************************
 **
diff --git a/src/blister.h b/src/blister.h
index 493574a72d..b6f4a1f8f0 100644
--- a/src/blister.h
+++ b/src/blister.h
@@ -221,6 +221,108 @@ void AssBlist(Obj list, Int pos, Obj val);
 
 void ConvBlist(Obj list);
 
+/****************************************************************************
+**
+*F  COUNT_TRUES_BLOCK( <block> ) . . . . . . . . . . . count number of trues
+**
+**  'COUNT_TRUES_BLOCK( <block> )' returns the number of 1 bits in the
+**  UInt <block>. Two implementations are included below. One uses the
+**  gcc builtin __builtin_popcountl, which usually generates the popcntl
+**  or popcntq instruction on sufficiently recent CPUs. The other uses
+**  the algorithm described in the original comment below:
+**
+**  The sequence to compute the number of bits in a block is quite clever.
+**  The idea is that after the <i>-th instruction each subblock of $2^i$ bits
+**  holds the number of bits of this subblock in the original block <block>.
+**  This is illustrated in the example below for a block with 8 bits:
+**
+**      // a b c d e f g h
+**      m = (m & 0x55) + ((m >> 1) & 0x55);
+**      // . b . d . f . h + . a . c . e . g = a+b c+d e+f g+h
+**      m = (m & 0x33) + ((m >> 2) & 0x33);
+**      // . . c+d . . g+h + . . a+b . . e+f = a+b+c+d e+f+g+h
+**      m = (m & 0x0f) + ((m >> 4) & 0x0f);
+**      // . . . . e+f+g+h + . . . . a+b+c+d = a+b+c+d+e+f+g+h
+**
+**  In the actual code some unnecessary masks have been removed, improving
+**  performance quite a bit, because masks are 32 bit immediate values for
+**  which most RISC processors need two instructions to load them. Talking
+**  about performance: the code is close to optimal; it should compile to
+**  only about 22 MIPS or SPARC instructions. Dividing the block into 4
+**  bytes and looking up the number of bits of a byte in a table may be 10%
+**  faster, but only if the table lives in the data cache.
+**
+**  At this time (2017) the optimum choice of implementation for this
+**  function as used seems to be to use the gcc builtin on all systems --
+**  but see the comments below in the documentation of
+**  'COUNT_TRUES_BLOCKS'.
+**
+*/
+static inline UInt COUNT_TRUES_BLOCK(UInt block)
+{
+#if USE_POPCNT && defined(HAVE___BUILTIN_POPCOUNTL)
+    return __builtin_popcountl(block);
+#else
+#ifdef SYS_IS_64_BIT
+    block =
+        (block & 0x5555555555555555L) + ((block >> 1) & 0x5555555555555555L);
+    block =
+        (block & 0x3333333333333333L) + ((block >> 2) & 0x3333333333333333L);
+    block = (block + (block >> 4)) & 0x0f0f0f0f0f0f0f0fL;
+    block = (block + (block >> 8));
+    block = (block + (block >> 16));
+    block = (block + (block >> 32)) & 0x00000000000000ffL;
+#else
+    block = (block & 0x55555555) + ((block >> 1) & 0x55555555);
+    block = (block & 0x33333333) + ((block >> 2) & 0x33333333);
+    block = (block + (block >> 4)) & 0x0f0f0f0f;
+    block = (block + (block >> 8));
+    block = (block + (block >> 16)) & 0x000000ff;
+#endif
+    return block;
+#endif
+}
+
+/****************************************************************************
+**
+*F  COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )
+**
+**  'COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )' returns the total number of 1
+**  bits in the array of UInt values starting at <ptr> and including a total
+**  of <nblocks> UInts. The only reason this function is really needed is
+**  that, owing to hardware bugs and compiler peculiarities current in 2017
+**  (see http://danluu.com/assembly-intrinsics/ or
+**  https://stackoverflow.com/questions/25078285?), manually unrolling this
+**  loop makes the code substantially faster on almost all CPUs.
+**
+**  This interacts strangely with the choice of algorithm for
+**  COUNT_TRUES_BLOCK above. Without the loop unrolling, not using the gcc
+**  builtin is sometimes faster, apparently because it allows the compiler
+**  to unroll the loop and then generate SSE or AVX code to process multiple
+**  words at once. With the loop unrolling the builtin is always faster, and
+**  will itself generate AVX code when compiling for suitable processors.
+**
+**  TODO: monitor this situation periodically.
+*/
+static inline UInt COUNT_TRUES_BLOCKS(const UInt * ptr, UInt nblocks)
+{
+    UInt n = 0;
+    while (nblocks >= 4) {
+        UInt n1 = COUNT_TRUES_BLOCK(*ptr++);
+        UInt n2 = COUNT_TRUES_BLOCK(*ptr++);
+        UInt n3 = COUNT_TRUES_BLOCK(*ptr++);
+        UInt n4 = COUNT_TRUES_BLOCK(*ptr++);
+        n += n1 + n2 + n3 + n4;
+        nblocks -= 4;
+    }
+    while (nblocks) {
+        n += COUNT_TRUES_BLOCK(*ptr++);
+        nblocks--;
+    }
+    // return the number of bits
+    return n;
+}
+
 /****************************************************************************
 **
 *F * * * * * * * * * * * * * initialize module * * * * * * * * * * * * * * *
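For readers who want to try the two techniques the moved comments describe (the mask-and-add popcount and the four-way manually unrolled summation), here is a minimal, self-contained sketch in plain C. It is not part of the patch: the names popcount_u64 and count_ones are hypothetical and chosen only for this illustration, and uint64_t stands in for GAP's UInt on a 64-bit system; GAP itself uses COUNT_TRUES_BLOCK and COUNT_TRUES_BLOCKS as defined above.

    /* sketch only -- not GAP code; assumes a 64-bit word, mirrors the
     * SYS_IS_64_BIT branch of COUNT_TRUES_BLOCK and the unrolled loop of
     * COUNT_TRUES_BLOCKS */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mask-and-add popcount: after step i, every 2^i-bit subblock holds
     * the number of 1 bits it originally contained. */
    static uint64_t popcount_u64(uint64_t b)
    {
        b = (b & 0x5555555555555555ULL) + ((b >> 1) & 0x5555555555555555ULL);
        b = (b & 0x3333333333333333ULL) + ((b >> 2) & 0x3333333333333333ULL);
        b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
        b = b + (b >> 8);
        b = b + (b >> 16);
        b = (b + (b >> 32)) & 0xffULL;
        return b;
    }

    /* Four-way unrolled summation over an array of words, in the style of
     * COUNT_TRUES_BLOCKS: handle blocks of four, then the remainder. */
    static uint64_t count_ones(const uint64_t * ptr, size_t nblocks)
    {
        uint64_t n = 0;
        while (nblocks >= 4) {
            uint64_t n1 = popcount_u64(*ptr++);
            uint64_t n2 = popcount_u64(*ptr++);
            uint64_t n3 = popcount_u64(*ptr++);
            uint64_t n4 = popcount_u64(*ptr++);
            n += n1 + n2 + n3 + n4;
            nblocks -= 4;
        }
        while (nblocks) {
            n += popcount_u64(*ptr++);
            nblocks--;
        }
        return n;
    }

    int main(void)
    {
        uint64_t blocks[5] = { 0xffULL, 0x1ULL, 0x0ULL, 0xf0f0ULL, ~0ULL };
        /* expected total: 8 + 1 + 0 + 8 + 64 = 81 */
        printf("%llu\n", (unsigned long long)count_ones(blocks, 5));
        return 0;
    }

The sketch keeps the two ideas separable: popcount_u64 can be swapped for a compiler builtin (as the USE_POPCNT branch does in the patch), while the unrolled accumulation in count_ones is what the comment argues recovers most of the lost throughput on compilers that otherwise fail to vectorise the simple loop.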