From 6692ca3bb99bd21b894f12e3c9a54141e1c498b9 Mon Sep 17 00:00:00 2001
From: Chris Jefferson
Date: Tue, 26 Feb 2019 11:48:19 +0000
Subject: [PATCH] Move COUNT_TRUES_BLOCKS and COUNT_TRUES_BLOCK back to a public header

---
 src/bits_intern.h | 102 ----------------------------------------------
 src/blister.h     | 102 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 102 deletions(-)

diff --git a/src/bits_intern.h b/src/bits_intern.h
index f3bde3df70..1c60e90641 100644
--- a/src/bits_intern.h
+++ b/src/bits_intern.h
@@ -13,108 +13,6 @@
 
 #include "system.h"
 
-/****************************************************************************
-**
-*F  COUNT_TRUES_BLOCK( <block> ) . . . . . . . . . . . count number of trues
-**
-**  'COUNT_TRUES_BLOCK( <block> )' returns the number of 1 bits in the
-**  UInt <block>. Two implementations are included below. One uses the
-**  gcc builtin __builtin_popcountl, which usually generates the popcntl
-**  or popcntq instruction on sufficiently recent CPUs. The other uses
-**  the algorithm described in the original comment below:
-**
-**  The sequence to compute the number of bits in a block is quite clever.
-**  The idea is that after the <i>-th instruction each subblock of $2^i$ bits
-**  holds the number of bits of this subblock in the original block <block>.
-**  This is illustrated in the example below for a block with 8 bits:
-**
-**      // a b c d e f g h
-**      m = (m & 0x55) + ((m >> 1) & 0x55);
-**      // . b . d . f . h + . a . c . e . g = a+b c+d e+f g+h
-**      m = (m & 0x33) + ((m >> 2) & 0x33);
-**      // . . c+d . . g+h + . . a+b . . e+f = a+b+c+d e+f+g+h
-**      m = (m & 0x0f) + ((m >> 4) & 0x0f);
-**      // . . . . e+f+g+h + . . . . a+b+c+d = a+b+c+d+e+f+g+h
-**
-**  In the actual code some unnecessary masks have been removed, improving
-**  performance quite a bit, because masks are 32 bit immediate values for
-**  which most RISC processors need two instructions to load them. Talking
-**  about performance: the code is close to optimal; it should compile to
-**  only about 22 MIPS or SPARC instructions. Dividing the block into 4
-**  bytes and looking up the number of bits of a byte in a table may be 10%
-**  faster, but only if the table lives in the data cache.
-**
-**  At this time (2017) the optimum choice of implementation for this
-**  function as used seems to be to use the gcc builtin on all systems --
-**  but see the comments below in the documentation of
-**  'COUNT_TRUES_BLOCKS'.
-**
-*/
-static inline UInt COUNT_TRUES_BLOCK(UInt block)
-{
-#if USE_POPCNT && defined(HAVE___BUILTIN_POPCOUNTL)
-    return __builtin_popcountl(block);
-#else
-#ifdef SYS_IS_64_BIT
-    block =
-        (block & 0x5555555555555555L) + ((block >> 1) & 0x5555555555555555L);
-    block =
-        (block & 0x3333333333333333L) + ((block >> 2) & 0x3333333333333333L);
-    block = (block + (block >> 4)) & 0x0f0f0f0f0f0f0f0fL;
-    block = (block + (block >> 8));
-    block = (block + (block >> 16));
-    block = (block + (block >> 32)) & 0x00000000000000ffL;
-#else
-    block = (block & 0x55555555) + ((block >> 1) & 0x55555555);
-    block = (block & 0x33333333) + ((block >> 2) & 0x33333333);
-    block = (block + (block >> 4)) & 0x0f0f0f0f;
-    block = (block + (block >> 8));
-    block = (block + (block >> 16)) & 0x000000ff;
-#endif
-    return block;
-#endif
-}
-
-/****************************************************************************
-**
-*F  COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )
-**
-**  'COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )' returns the total number of 1
-**  bits in the array of UInt values starting at <ptr> and including a total
-**  of <nblocks> UInts. The only reason this function is really needed is
-**  that, owing to hardware bugs and compiler peculiarities current in 2017
-**  (see http://danluu.com/assembly-intrinsics/ or
-**  https://stackoverflow.com/questions/25078285?), manually unrolling this
-**  loop makes the code substantially faster on almost all CPUs.
-**
-**  This interacts strangely with the choice of algorithm for
-**  COUNT_TRUES_BLOCK above. Without the loop unrolling, not using the gcc
-**  builtin is sometimes faster, apparently because it allows the compiler
-**  to unroll the loop and then generate SSE or AVX code to process multiple
-**  words at once. With the loop unrolling the builtin is always faster, and
-**  will itself generate AVX code when compiling for suitable processors.
-**
-**  TODO: monitor this situation periodically.
-*/
-static inline UInt COUNT_TRUES_BLOCKS(const UInt * ptr, UInt nblocks)
-{
-    UInt n = 0;
-    while (nblocks >= 4) {
-        UInt n1 = COUNT_TRUES_BLOCK(*ptr++);
-        UInt n2 = COUNT_TRUES_BLOCK(*ptr++);
-        UInt n3 = COUNT_TRUES_BLOCK(*ptr++);
-        UInt n4 = COUNT_TRUES_BLOCK(*ptr++);
-        n += n1 + n2 + n3 + n4;
-        nblocks -= 4;
-    }
-    while (nblocks) {
-        n += COUNT_TRUES_BLOCK(*ptr++);
-        nblocks--;
-    }
-    // return the number of bits
-    return n;
-}
-
 /****************************************************************************
 **
diff --git a/src/blister.h b/src/blister.h
index 493574a72d..b6f4a1f8f0 100644
--- a/src/blister.h
+++ b/src/blister.h
@@ -221,6 +221,108 @@ void AssBlist(Obj list, Int pos, Obj val);
 
 void ConvBlist(Obj list);
 
+/****************************************************************************
+**
+*F  COUNT_TRUES_BLOCK( <block> ) . . . . . . . . . . . count number of trues
+**
+**  'COUNT_TRUES_BLOCK( <block> )' returns the number of 1 bits in the
+**  UInt <block>. Two implementations are included below. One uses the
+**  gcc builtin __builtin_popcountl, which usually generates the popcntl
+**  or popcntq instruction on sufficiently recent CPUs. The other uses
+**  the algorithm described in the original comment below:
+**
+**  The sequence to compute the number of bits in a block is quite clever.
+**  The idea is that after the <i>-th instruction each subblock of $2^i$ bits
+**  holds the number of bits of this subblock in the original block <block>.
+**  This is illustrated in the example below for a block with 8 bits:
+**
+**      // a b c d e f g h
+**      m = (m & 0x55) + ((m >> 1) & 0x55);
+**      // . b . d . f . h + . a . c . e . g = a+b c+d e+f g+h
+**      m = (m & 0x33) + ((m >> 2) & 0x33);
+**      // . . c+d . . g+h + . . a+b . . e+f = a+b+c+d e+f+g+h
+**      m = (m & 0x0f) + ((m >> 4) & 0x0f);
+**      // . . . . e+f+g+h + . . . . a+b+c+d = a+b+c+d+e+f+g+h
+**
+**  In the actual code some unnecessary masks have been removed, improving
+**  performance quite a bit, because masks are 32 bit immediate values for
+**  which most RISC processors need two instructions to load them. Talking
+**  about performance: the code is close to optimal; it should compile to
+**  only about 22 MIPS or SPARC instructions. Dividing the block into 4
+**  bytes and looking up the number of bits of a byte in a table may be 10%
+**  faster, but only if the table lives in the data cache.
+**
+**  At this time (2017) the optimum choice of implementation for this
+**  function as used seems to be to use the gcc builtin on all systems --
+**  but see the comments below in the documentation of
+**  'COUNT_TRUES_BLOCKS'.
+**
+*/
+static inline UInt COUNT_TRUES_BLOCK(UInt block)
+{
+#if USE_POPCNT && defined(HAVE___BUILTIN_POPCOUNTL)
+    return __builtin_popcountl(block);
+#else
+#ifdef SYS_IS_64_BIT
+    block =
+        (block & 0x5555555555555555L) + ((block >> 1) & 0x5555555555555555L);
+    block =
+        (block & 0x3333333333333333L) + ((block >> 2) & 0x3333333333333333L);
+    block = (block + (block >> 4)) & 0x0f0f0f0f0f0f0f0fL;
+    block = (block + (block >> 8));
+    block = (block + (block >> 16));
+    block = (block + (block >> 32)) & 0x00000000000000ffL;
+#else
+    block = (block & 0x55555555) + ((block >> 1) & 0x55555555);
+    block = (block & 0x33333333) + ((block >> 2) & 0x33333333);
+    block = (block + (block >> 4)) & 0x0f0f0f0f;
+    block = (block + (block >> 8));
+    block = (block + (block >> 16)) & 0x000000ff;
+#endif
+    return block;
+#endif
+}
+
+/****************************************************************************
+**
+*F  COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )
+**
+**  'COUNT_TRUES_BLOCKS( <ptr>, <nblocks> )' returns the total number of 1
+**  bits in the array of UInt values starting at <ptr> and including a total
+**  of <nblocks> UInts. The only reason this function is really needed is
+**  that, owing to hardware bugs and compiler peculiarities current in 2017
+**  (see http://danluu.com/assembly-intrinsics/ or
+**  https://stackoverflow.com/questions/25078285?), manually unrolling this
+**  loop makes the code substantially faster on almost all CPUs.
+**
+**  This interacts strangely with the choice of algorithm for
+**  COUNT_TRUES_BLOCK above. Without the loop unrolling, not using the gcc
+**  builtin is sometimes faster, apparently because it allows the compiler
+**  to unroll the loop and then generate SSE or AVX code to process multiple
+**  words at once. With the loop unrolling the builtin is always faster, and
+**  will itself generate AVX code when compiling for suitable processors.
+**
+**  TODO: monitor this situation periodically.
+*/
+static inline UInt COUNT_TRUES_BLOCKS(const UInt * ptr, UInt nblocks)
+{
+    UInt n = 0;
+    while (nblocks >= 4) {
+        UInt n1 = COUNT_TRUES_BLOCK(*ptr++);
+        UInt n2 = COUNT_TRUES_BLOCK(*ptr++);
+        UInt n3 = COUNT_TRUES_BLOCK(*ptr++);
+        UInt n4 = COUNT_TRUES_BLOCK(*ptr++);
+        n += n1 + n2 + n3 + n4;
+        nblocks -= 4;
+    }
+    while (nblocks) {
+        n += COUNT_TRUES_BLOCK(*ptr++);
+        nblocks--;
+    }
+    // return the number of bits
+    return n;
+}
+
 /****************************************************************************
 **
 *F * * * * * * * * * * * * * initialize module * * * * * * * * * * * * * * *
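For readers who want to try the two techniques the moved comments describe (the mask-and-add popcount and the four-way manually unrolled summation), here is a minimal, self-contained sketch in plain C. It is not part of the patch: the names popcount_u64 and count_ones are hypothetical and chosen only for this illustration, and uint64_t stands in for GAP's UInt on a 64-bit system; GAP itself uses COUNT_TRUES_BLOCK and COUNT_TRUES_BLOCKS as defined above.

    /* sketch only -- not GAP code; assumes a 64-bit word, mirrors the
     * SYS_IS_64_BIT branch of COUNT_TRUES_BLOCK and the unrolled loop of
     * COUNT_TRUES_BLOCKS */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mask-and-add popcount: after step i, every 2^i-bit subblock holds
     * the number of 1 bits it originally contained. */
    static uint64_t popcount_u64(uint64_t b)
    {
        b = (b & 0x5555555555555555ULL) + ((b >> 1) & 0x5555555555555555ULL);
        b = (b & 0x3333333333333333ULL) + ((b >> 2) & 0x3333333333333333ULL);
        b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
        b = b + (b >> 8);
        b = b + (b >> 16);
        b = (b + (b >> 32)) & 0xffULL;
        return b;
    }

    /* Four-way unrolled summation over an array of words, in the style of
     * COUNT_TRUES_BLOCKS: handle blocks of four, then the remainder. */
    static uint64_t count_ones(const uint64_t * ptr, size_t nblocks)
    {
        uint64_t n = 0;
        while (nblocks >= 4) {
            uint64_t n1 = popcount_u64(*ptr++);
            uint64_t n2 = popcount_u64(*ptr++);
            uint64_t n3 = popcount_u64(*ptr++);
            uint64_t n4 = popcount_u64(*ptr++);
            n += n1 + n2 + n3 + n4;
            nblocks -= 4;
        }
        while (nblocks) {
            n += popcount_u64(*ptr++);
            nblocks--;
        }
        return n;
    }

    int main(void)
    {
        uint64_t blocks[5] = { 0xffULL, 0x1ULL, 0x0ULL, 0xf0f0ULL, ~0ULL };
        /* expected total: 8 + 1 + 0 + 8 + 64 = 81 */
        printf("%llu\n", (unsigned long long)count_ones(blocks, 5));
        return 0;
    }

The sketch keeps the two ideas separable: popcount_u64 can be swapped for a compiler builtin (as the USE_POPCNT branch does in the patch), while the unrolled accumulation in count_ones is what the comment argues recovers most of the lost throughput on compilers that otherwise fail to vectorise the simple loop.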