Move COUNT_TRUES_BLOCKS and COUNT_TRUES_BLOCK back to a public header
ChrisJefferson authored and wilfwilson committed Feb 26, 2019
1 parent 645371a commit 6692ca3
Showing 2 changed files with 102 additions and 102 deletions.
102 changes: 0 additions & 102 deletions src/bits_intern.h
@@ -13,108 +13,6 @@

#include "system.h"

/****************************************************************************
**
*F COUNT_TRUES_BLOCK( <block> ) . . . . . . . . . . . count number of trues
**
** 'COUNT_TRUES_BLOCK( <block> )' returns the number of 1 bits in the
** UInt <block>. Two implementations are included below. One uses the
** gcc builtin __builtin_popcountl, which usually generates the popcntl
** or popcntq instruction on sufficiently recent CPUs. The other uses
** the algorithm described in the original comment below:
**
** The sequence to compute the number of bits in a block is quite clever.
** The idea is that after the <i>-th instruction each subblock of $2^i$ bits
** holds the number of bits of this subblock in the original block <m>.
** This is illustrated in the example below for a block with 8 bits:
**
** // a b c d e f g h
** m = (m & 0x55) + ((m >> 1) & 0x55);
** // . b . d . f . h + . a . c . e . g = a+b c+d e+f g+h
** m = (m & 0x33) + ((m >> 2) & 0x33);
** // . . c+d . . g+h + . . a+b . . e+f = a+b+c+d e+f+g+h
** m = (m & 0x0f) + ((m >> 4) & 0x0f);
** // . . . . e+f+g+h + . . . . a+b+c+d = a+b+c+d+e+f+g+h
**
** In the actual code some unnecessary masks have been removed, improving
** performance quite a bit, because masks are 32 bit immediate values for
** which most RISC processors need two instructions to load them. Talking
** about performance: the code is close to optimal; it should compile to
** only about 22 MIPS or SPARC instructions. Dividing the block into 4
** bytes and looking up the number of bits of a byte in a table may be 10%
** faster, but only if the table lives in the data cache.
**
** At this time (2017) the optimum choice of implementation for this
** function as used seems to be to use the gcc builtin on all systems --
** but see the comments below in the documentation of
** 'COUNT_TRUES_BLOCKS'.
**
*/
static inline UInt COUNT_TRUES_BLOCK(UInt block)
{
#if USE_POPCNT && defined(HAVE___BUILTIN_POPCOUNTL)
return __builtin_popcountl(block);
#else
#ifdef SYS_IS_64_BIT
block =
(block & 0x5555555555555555L) + ((block >> 1) & 0x5555555555555555L);
block =
(block & 0x3333333333333333L) + ((block >> 2) & 0x3333333333333333L);
block = (block + (block >> 4)) & 0x0f0f0f0f0f0f0f0fL;
block = (block + (block >> 8));
block = (block + (block >> 16));
block = (block + (block >> 32)) & 0x00000000000000ffL;
#else
block = (block & 0x55555555) + ((block >> 1) & 0x55555555);
block = (block & 0x33333333) + ((block >> 2) & 0x33333333);
block = (block + (block >> 4)) & 0x0f0f0f0f;
block = (block + (block >> 8));
block = (block + (block >> 16)) & 0x000000ff;
#endif
return block;
#endif
}

/****************************************************************************
**
*F COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )
**
** 'COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )' returns the total number of 1
** bits in the array of UInt values starting at <ptr> and including a total
** of <nblocks> UInts. The only reason this function is really needed is
** that, owing to hardware bugs and compiler peculiarities current in 2017
** (see http://danluu.com/assembly-intrinsics/ or
** https://stackoverflow.com/questions/25078285?), manually unrolling this
** loop makes the code substantially faster on almost all CPUs.
**
** This interacts strangely with the choice of algorithm for
** COUNT_TRUES_BLOCK above. Without the loop unrolling, not using the gcc
** builtin is sometimes faster, apparently because it allows the compiler
** to unroll the loop and then generate SSE or AVX code to process multiple
** words at once. With the loop unrolling the builtin is always faster, and
** will itself generate AVX code when compiling for suitable processors.
**
** TODO: monitor this situation periodically.
*/
static inline UInt COUNT_TRUES_BLOCKS(const UInt * ptr, UInt nblocks)
{
UInt n = 0;
while (nblocks >= 4) {
UInt n1 = COUNT_TRUES_BLOCK(*ptr++);
UInt n2 = COUNT_TRUES_BLOCK(*ptr++);
UInt n3 = COUNT_TRUES_BLOCK(*ptr++);
UInt n4 = COUNT_TRUES_BLOCK(*ptr++);
n += n1 + n2 + n3 + n4;
nblocks -= 4;
}
while (nblocks) {
n += COUNT_TRUES_BLOCK(*ptr++);
nblocks--;
}
// return the number of bits
return n;
}


/****************************************************************************
**
102 changes: 102 additions & 0 deletions src/blister.h
@@ -221,6 +221,108 @@ void AssBlist(Obj list, Int pos, Obj val);
void ConvBlist(Obj list);


/****************************************************************************
**
*F COUNT_TRUES_BLOCK( <block> ) . . . . . . . . . . . count number of trues
**
** 'COUNT_TRUES_BLOCK( <block> )' returns the number of 1 bits in the
** UInt <block>. Two implementations are included below. One uses the
** gcc builtin __builtin_popcountl, which usually generates the popcntl
** or popcntq instruction on sufficiently recent CPUs. The other uses
** the algorithm described in the original comment below:
**
** The sequence to compute the number of bits in a block is quite clever.
** The idea is that after the <i>-th instruction each subblock of $2^i$ bits
** holds the number of bits of this subblock in the original block <m>.
** This is illustrated in the example below for a block with 8 bits:
**
** // a b c d e f g h
** m = (m & 0x55) + ((m >> 1) & 0x55);
** // . b . d . f . h + . a . c . e . g = a+b c+d e+f g+h
** m = (m & 0x33) + ((m >> 2) & 0x33);
** // . . c+d . . g+h + . . a+b . . e+f = a+b+c+d e+f+g+h
** m = (m & 0x0f) + ((m >> 4) & 0x0f);
** // . . . . e+f+g+h + . . . . a+b+c+d = a+b+c+d+e+f+g+h
**
** In the actual code some unnecessary masks have been removed, improving
** performance quite a bit, because masks are 32 bit immediate values for
** which most RISC processors need two instructions to load them. Talking
** about performance: the code is close to optimal; it should compile to
** only about 22 MIPS or SPARC instructions. Dividing the block into 4
** bytes and looking up the number of bits of a byte in a table may be 10%
** faster, but only if the table lives in the data cache.
**
** At this time (2017) the optimum choice of implementation for this
** function as used seems to be to use the gcc builtin on all systems --
** but see the comments below in the documentation of
** 'COUNT_TRUES_BLOCKS'.
**
*/
static inline UInt COUNT_TRUES_BLOCK(UInt block)
{
#if USE_POPCNT && defined(HAVE___BUILTIN_POPCOUNTL)
return __builtin_popcountl(block);
#else
#ifdef SYS_IS_64_BIT
block =
(block & 0x5555555555555555L) + ((block >> 1) & 0x5555555555555555L);
block =
(block & 0x3333333333333333L) + ((block >> 2) & 0x3333333333333333L);
block = (block + (block >> 4)) & 0x0f0f0f0f0f0f0f0fL;
block = (block + (block >> 8));
block = (block + (block >> 16));
block = (block + (block >> 32)) & 0x00000000000000ffL;
#else
block = (block & 0x55555555) + ((block >> 1) & 0x55555555);
block = (block & 0x33333333) + ((block >> 2) & 0x33333333);
block = (block + (block >> 4)) & 0x0f0f0f0f;
block = (block + (block >> 8));
block = (block + (block >> 16)) & 0x000000ff;
#endif
return block;
#endif
}
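
/* Editorial sketch, not part of this commit: a concrete trace of the
** mask-and-add steps described above on a single 8-bit value. A plain
** unsigned char stands in for GAP's UInt purely for illustration, and the
** helper name below is hypothetical; it does not exist in the GAP sources.
*/
static inline unsigned char count_trues_byte_example(void)
{
    unsigned char m = 0xB5;              // 10110101, which has five 1 bits
    m = (m & 0x55) + ((m >> 1) & 0x55);  // 0x65: each 2-bit field holds the count of its pair
    m = (m & 0x33) + ((m >> 2) & 0x33);  // 0x32: each nibble holds its count (3 and 2)
    m = (m & 0x0f) + ((m >> 4) & 0x0f);  // 0x05: the total number of 1 bits
    return m;                            // returns 5
}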

/****************************************************************************
**
*F COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )
**
** 'COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )' returns the total number of 1
** bits in the array of UInt values starting at <ptr> and including a total
** of <nblocks> UInts. The only reason this function is really needed is
** that, owing to hardware bugs and compiler peculiarities current in 2017
** (see http://danluu.com/assembly-intrinsics/ or
** https://stackoverflow.com/questions/25078285?), manually unrolling this
** loop makes the code substantially faster on almost all CPUs.
**
** This interacts strangely with the choice of algorithm for
** COUNT_TRUES_BLOCK above. Without the loop unrolling, not using the gcc
** builtin is sometimes faster, apparently because it allows the compiler
** to unroll the loop and then generate SSE or AVX code to process multiple
** words at once. With the loop unrolling the builtin is always faster, and
** will itself generate AVX code when compiling for suitable processors.
**
** TODO: monitor this situation periodically.
*/
static inline UInt COUNT_TRUES_BLOCKS(const UInt * ptr, UInt nblocks)
{
UInt n = 0;
while (nblocks >= 4) {
UInt n1 = COUNT_TRUES_BLOCK(*ptr++);
UInt n2 = COUNT_TRUES_BLOCK(*ptr++);
UInt n3 = COUNT_TRUES_BLOCK(*ptr++);
UInt n4 = COUNT_TRUES_BLOCK(*ptr++);
n += n1 + n2 + n3 + n4;
nblocks -= 4;
}
while (nblocks) {
n += COUNT_TRUES_BLOCK(*ptr++);
nblocks--;
}
// return the number of bits
return n;
}
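
/* Editorial sketch, not part of this commit: a minimal usage example for
** COUNT_TRUES_BLOCKS, assuming UInt is available as in the surrounding
** header. In GAP proper the block pointer and block count would come from a
** boolean-list object; here a local array stands in, and the helper name is
** hypothetical.
*/
static inline UInt example_count_trues(void)
{
    const UInt blocks[3] = { 0xF0F0, 0x1, 0x0 };   // 8 + 1 + 0 = 9 bits set
    return COUNT_TRUES_BLOCKS(blocks, 3);          // returns 9
}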

/****************************************************************************
**
*F * * * * * * * * * * * * * initialize module * * * * * * * * * * * * * * *
