diff --git a/M2/BUILD/frank/Makefile b/M2/BUILD/frank/Makefile
index 0ab5caed2f5..3ee18ea8cf2 100644
--- a/M2/BUILD/frank/Makefile
+++ b/M2/BUILD/frank/Makefile
@@ -17,6 +17,21 @@ cmake-make-appleclang:
 		-DBUILD_DOCS=on \
 		../../../..
 
+##	CC="`brew --prefix llvm`/bin/clang" CXX="`brew --prefix llvm`/bin/clang++" cmake \
+
+cmake-clang:
+	echo "git branch is " $(BRANCH)
+	mkdir -p builds.tmp/cmake-clang
+	cd builds.tmp/cmake-clang; cmake \
+		-GNinja \
+		-DCMAKE_BUILD_TYPE=RelWithDebInfo \
+		-DBUILD_TESTING=on \
+		-DCMAKE_C_COMPILER="`brew --prefix llvm`/bin/clang" \
+		-DCMAKE_CXX_COMPILER="`brew --prefix llvm`/bin/clang++" \
+		-DBUILD_DOCS=on \
+		-DBUILD_NATIVE=off \
+		../../../..
+
 cmake-appleclang:
 	echo "git branch is " $(BRANCH)
 	mkdir -p builds.tmp/cmake-appleclang
@@ -24,7 +39,7 @@ cmake-appleclang:
 		-GNinja \
 		-DCMAKE_BUILD_TYPE=RelWithDebInfo \
 		-DBUILD_TESTING=on \
-		-DCMAKE_PREFIX_PATH="`brew --prefix libffi`" \
+		-DCMAKE_PREFIX_PATH="`brew --prefix libomp`" \
 		-DBUILD_DOCS=on \
 		-DBUILD_NATIVE=off \
 		../../../..
diff --git a/M2/Macaulay2/e/CMakeLists.txt b/M2/Macaulay2/e/CMakeLists.txt
index f6a0ea96703..72fd6ae7a45 100644
--- a/M2/Macaulay2/e/CMakeLists.txt
+++ b/M2/Macaulay2/e/CMakeLists.txt
@@ -299,7 +299,6 @@ set(SRCLIST
   gb-f4/SPairs
   f4/f4-computation
   f4/f4-m2-interface
-  f4/f4-mem
   f4/f4-monlookup
   f4/f4-spairs
   f4/f4
diff --git a/M2/Macaulay2/e/Makefile.files b/M2/Macaulay2/e/Makefile.files
index 807e8926a6f..f75b786285a 100644
--- a/M2/Macaulay2/e/Makefile.files
+++ b/M2/Macaulay2/e/Makefile.files
@@ -62,7 +62,6 @@ INTERFACE = \
 	gb-f4/PolynomialList \
 	gb-f4/SPairs \
 	f4/f4 \
-	f4/f4-mem \
 	f4/f4-monlookup \
 	f4/f4-computation \
 	f4/f4-spairs \
diff --git a/M2/Macaulay2/e/VectorArithmetic.hpp b/M2/Macaulay2/e/VectorArithmetic.hpp
index e78edaf19fa..36fe3840956 100644
--- a/M2/Macaulay2/e/VectorArithmetic.hpp
+++ b/M2/Macaulay2/e/VectorArithmetic.hpp
@@ -16,7 +16,6 @@
 // #include "aring-glue.hpp"           // for ConcreteRing
 // #include "aring.hpp"                // for DummyRing, ring_GFFlintBig, ring_...
 // #include "buffer.hpp"               // for buffer
-// #include "f4/f4-mem.hpp"            // for F4Vec
 // #include "ring.hpp"                 // for Ring
 // #include "ringelem.hpp"             // for ring_elem
 
@@ -42,7 +41,6 @@
 #include "aring-glue.hpp"
 #include <variant>
 #include <type_traits>
-#include "f4/f4-mem.hpp"         // for F4Mem
 
 class Ring;
 using ComponentIndex = int;
@@ -296,8 +294,7 @@ class ConcreteVectorArithmetic
                            ElementArray& sparse, // output value: sets this value
                            int*& comps,
                            int first,
-                           int last,
-                           F4Vec& f4Vec) const
+                           int last) const
   {
     auto& dvec = * elementArray(dense);
     
@@ -308,7 +305,8 @@ class ConcreteVectorArithmetic
     for (int i = first; i >= 0 and i <= last; i++)
 	if (not mRing->is_zero(dvec[i])) len++;
 
-    comps = f4Vec.allocate(len);
+    //comps = f4Vec.allocate(len);
+    comps = new int[len];
 
     sparse = allocateElementArray(len);
     auto& svec = * elementArray(sparse);
@@ -716,9 +714,8 @@ class VectorArithmetic
                            ElementArray& coeffs, // sets coeffs
                            int*& comps, // sets comps
                            int first,
-                           int last,
-                           F4Vec& f4Vec) const {
-    std::visit([&](auto& arg) { arg->denseToSparse(dense,coeffs,comps,first,last,f4Vec); }, mConcreteVector);  
+                           int last) const {
+    std::visit([&](auto& arg) { arg->denseToSparse(dense,coeffs,comps,first,last); }, mConcreteVector);  
   }
 
   template<typename LockType>
diff --git a/M2/Macaulay2/e/comp-gb.cpp b/M2/Macaulay2/e/comp-gb.cpp
index 10665f46bee..a8fe293aa4f 100644
--- a/M2/Macaulay2/e/comp-gb.cpp
+++ b/M2/Macaulay2/e/comp-gb.cpp
@@ -20,12 +20,14 @@ GBComputation *createF4GB(const Matrix *m,
                           M2_arrayint gb_weights,
                           int strategy,
                           M2_bool use_max_degree,
-                          int max_degree);
+                          int max_degree,
+                          int numThreads);
 
 // Found in "gb-f4/GBF4Interface.hpp"
 GBComputation *createGBF4Interface(const Matrix *m,
                                   const std::vector<int>& weights,
-                                  int strategy
+                                  int strategy,
+                                  int numThreads
                                   );
 
 GBComputation::~GBComputation() {}
@@ -42,6 +44,7 @@ GBComputation *GBComputation::choose_gb(const Matrix *m,
                                         int max_degree,
                                         int algorithm,
                                         int strategy,
+                                        int numThreads,
                                         int max_reduction_count)
 {
   const Ring *R1 = m->get_ring();
@@ -105,7 +108,8 @@ GBComputation *GBComputation::choose_gb(const Matrix *m,
                             gb_weights,
                             strategy,
                             use_max_degree,
-                            max_degree);
+                            max_degree,
+                            numThreads);
         break;
       case 7:
         result = binomialGB_comp::create(m,
@@ -120,10 +124,12 @@ GBComputation *GBComputation::choose_gb(const Matrix *m,
         ERROR("Algorithm => Test has been removed from M2");
         return nullptr;
       case 9:
+        // new GBF4 algorithm
         weights = M2_arrayint_to_stdvector<int>(gb_weights);
         result = createGBF4Interface(m,
                             weights,
-                            strategy);
+                            strategy,
+                            numThreads);
         break;
       default:
         result = gbA::create(m,
diff --git a/M2/Macaulay2/e/comp-gb.hpp b/M2/Macaulay2/e/comp-gb.hpp
index 43f429ee32e..f5f39727fc9 100644
--- a/M2/Macaulay2/e/comp-gb.hpp
+++ b/M2/Macaulay2/e/comp-gb.hpp
@@ -49,6 +49,7 @@ class GBComputation : public Computation
                                   int max_degree,
                                   int algorithm,
                                   int strategy,
+                                  int numThreads,
                                   int max_reduction_count = 10);
   // Values for algorithm and strategy are documented in engine.h
   // Returns NULL if an error occurs
diff --git a/M2/Macaulay2/e/f4/f4-computation.cpp b/M2/Macaulay2/e/f4/f4-computation.cpp
index ec99c8644e2..f9051cab1b9 100644
--- a/M2/Macaulay2/e/f4/f4-computation.cpp
+++ b/M2/Macaulay2/e/f4/f4-computation.cpp
@@ -5,7 +5,6 @@
 #include "buffer.hpp"              // for buffer
 #include "error.h"                 // for ERROR
 #include "f4/f4-m2-interface.hpp"  // for F4toM2Interface
-#include "f4/f4-mem.hpp"           // for F4Mem
 #include "f4/f4-types.hpp"         // for gb_array, gbelem
 #include "f4/f4.hpp"               // for F4GB
 #include "f4/moninfo.hpp"          // for MonomialInfo
@@ -28,48 +27,44 @@ GBComputation *createF4GB(const Matrix *m,
                           M2_arrayint gb_weights,
                           int strategy,
                           M2_bool use_max_degree,
-                          int max_degree)
+                          int max_degree,
+                          int numThreads)
 {
   const PolynomialRing *R = m->get_ring()->cast_to_PolynomialRing();
   const Ring *K = R->getCoefficients();
-  F4Mem *Mem = new F4Mem;
   auto vectorArithmetic = new VectorArithmetic(K);
   // TODO: code here used to detect whether R, K is a valid ring here
 
   GBComputation *G;
   G = new F4Computation(vectorArithmetic,
-                        Mem,
                         m,
                         collect_syz,
                         n_rows_to_keep,
                         gb_weights,
                         strategy,
                         use_max_degree,
-                        max_degree);
+                        max_degree,
+                        numThreads);
   return G;
 }
 
 F4Computation::F4Computation(const VectorArithmetic* VA,
-                             F4Mem *Mem0,
                              const Matrix *m,
                              M2_bool collect_syz,
                              int n_rows_to_keep,
                              M2_arrayint gb_weights,
                              int strategy,
                              M2_bool use_max_degree,
-                             int max_degree)
+                             int max_degree,
+                             int numThreads)
   : mFreeModule(m->rows()),
-    mVectorArithmetic(VA),
-    mMemoryBlock(Mem0)
+    mVectorArithmetic(VA)
 {
-  // Note: the F4Mem which K0 uses should be mMemoryBlock. ??TODO: no longer valid, still containing useful info?
   mOriginalRing = m->get_ring()->cast_to_PolynomialRing();
   mMonoid = new MonomialInfo(mOriginalRing->n_vars(),
                         mOriginalRing->getMonoid()->getMonomialOrdering());
 
-
   mF4GB = new F4GB(mVectorArithmetic,
-                   Mem0,
                    mMonoid,
                    m->rows(),
                    collect_syz,
@@ -77,13 +72,14 @@ F4Computation::F4Computation(const VectorArithmetic* VA,
                    gb_weights,
                    strategy,
                    use_max_degree,
-                   max_degree);
+                   max_degree,
+                   numThreads);
 
   F4toM2Interface::from_M2_matrix(mVectorArithmetic, mMonoid, m, gb_weights, mF4GB->get_generators());
   mF4GB->new_generators(0, m->n_cols() - 1);
 }
 
-F4Computation::~F4Computation() { delete mMemoryBlock; }
+F4Computation::~F4Computation() = default;
 /*************************
  ** Top level interface **
  *************************/
@@ -169,7 +165,6 @@ void F4Computation::show() const  // debug display
   stash::stats(o);
   emit(o.str());
 
-  mMemoryBlock->show();
   // f4->show();
 }
 
diff --git a/M2/Macaulay2/e/f4/f4-computation.hpp b/M2/Macaulay2/e/f4/f4-computation.hpp
index ec3eb4b1cb9..d9e2b5a569c 100644
--- a/M2/Macaulay2/e/f4/f4-computation.hpp
+++ b/M2/Macaulay2/e/f4/f4-computation.hpp
@@ -9,7 +9,6 @@
 #include "interface/computation.h"  // for ComputationStatusCode
 #include "polyring.hpp"             // for PolynomialRing
 class Computation;
-class F4Mem;
 class FreeModule;
 class Matrix;
 class MonomialInfo;
@@ -29,18 +28,17 @@ class F4Computation : public GBComputation
                                    // Also determines degrees of elements in F.
   const VectorArithmetic* mVectorArithmetic;
   MonomialInfo *mMonoid;
-  F4Mem *mMemoryBlock;
   F4GB *mF4GB;
  public:
   F4Computation(const VectorArithmetic* VA,
-                F4Mem *Mem,
                 const Matrix *m,
                 M2_bool collect_syz,
                 int n_rows_to_keep,
                 M2_arrayint gb_weights,
                 int strategy,
                 M2_bool use_max_degree,
-                int max_degree);
+                int max_degree,
+                int numThreads);
 
   ~F4Computation() override;
 
@@ -90,7 +88,8 @@ GBComputation *createF4GB(const Matrix *m,
                           M2_arrayint gb_weights,
                           int strategy,
                           M2_bool use_max_degree,
-                          int max_degree);
+                          int max_degree,
+                          int numThreads);
 
 #endif
 
diff --git a/M2/Macaulay2/e/f4/f4-mem.cpp b/M2/Macaulay2/e/f4/f4-mem.cpp
deleted file mode 100644
index 05b03cb02ab..00000000000
--- a/M2/Macaulay2/e/f4/f4-mem.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "f4/f4-mem.hpp"
-#include <cstdio>
-
-const unsigned int doublesize[DOUBLESIZE] = {
-    12,        24,        48,        96,         192,        384,
-    768,       1536,      3072,      6144,       12288,      24576,
-    49152,     98304,     196608,    393216,     786432,     1572864,
-    3145728,   6291456,   12582912,  25165824,   50331648,   100663296,
-    201326592, 402653184, 805306368, 1610612736, 3221225472U};
-
-void F4Vec::show()
-{
-  fprintf(stderr, "memory usage for %s.  nreallocs = %d\n", name, nreallocs);
-  fprintf(stderr,
-          "index\t\tsize\t\tnalloc\t\tndealloc\t\thighwater\tcurrent\n");
-  for (int i = 0; i < DOUBLESIZE; i++)
-    if (nallocs[i] > 0)
-      {
-        fprintf(stderr,
-                "%4u\t%12u\t%12u\t%12u\t\t%12u\t%12u\n",
-                i,
-                doublesize[i],
-                nallocs[i],
-                ndeallocs[i],
-                highwater[i],
-                current[i]);
-      }
-}
-
-F4Mem::F4Mem()
-    : monom_alloc(0),
-      monom_total(0),
-      monom_freed(0),
-      monom_dealloc(0),
-      monom_highwater(0),
-      monom_current(0),
-      components("comps"),
-      coefficients("coeffs")
-{
-}
-
-void F4Mem::show()
-{
-  fprintf(stderr,
-          "class\tnalloc\tndealloc\thighwater\ttotal\tfreed\tcurrent\n");
-  fprintf(stderr,
-          "monom\t%ld\t%ld\t\t%ld\t\t%ld\t%ld\t%ld\n",
-          monom_alloc,
-          monom_dealloc,
-          monom_highwater,
-          monom_total,
-          monom_freed,
-          monom_current);
-  components.show();
-  coefficients.show();
-}
-
-// Local Variables:
-// compile-command: "make -C $M2BUILDDIR/Macaulay2/e "
-// indent-tabs-mode: nil
-// End:
diff --git a/M2/Macaulay2/e/f4/f4-mem.hpp b/M2/Macaulay2/e/f4/f4-mem.hpp
deleted file mode 100644
index 9101e415054..00000000000
--- a/M2/Macaulay2/e/f4/f4-mem.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-// Copyright 2007 Michael E. Stillman
-
-#ifndef __f4mem_h_
-#define __f4mem_h_
-
-#include "newdelete.hpp"
-#include "f4/moninfo.hpp"  // only for monomial_word
-
-typedef int *f4vec;
-
-#define DOUBLESIZE 30
-extern const unsigned int doublesize[DOUBLESIZE];
-
-#define SIZE_MASK 0x07ffffff
-#define STASH_SHIFT (8 * sizeof(int) - 5)
-#define STASH(a) (a >> STASH_SHIFT)
-
-class F4Vec
-{
-  // Format for the items pointed to:
-  //  a[-1] = (top 5 bits: which stash this belongs to)
-  //           (rest of the bits: current size of the array)
-  const char *name;
-
-  int nreallocs;
-  int nallocs[DOUBLESIZE];
-  int highwater[DOUBLESIZE];
-  int current[DOUBLESIZE];
-  int ndeallocs[DOUBLESIZE];
-
-  int find_alloc_size(int sz)
-  {
-    sz++;
-    int s = 0;
-    while (doublesize[s] < sz) s++;
-    return s;
-  }
-
- public:
-  void reset()
-  {
-    nreallocs = 0;
-    for (int i = 0; i < DOUBLESIZE; i++) nallocs[i] = 0;
-    for (int i = 0; i < DOUBLESIZE; i++) highwater[i] = 0;
-    for (int i = 0; i < DOUBLESIZE; i++) current[i] = 0;
-    for (int i = 0; i < DOUBLESIZE; i++) ndeallocs[i] = 0;
-  }
-
-  F4Vec(const char *name0) : name(name0) { reset(); }
-  ~F4Vec() { name = nullptr; }
-  int size(f4vec a) { return (SIZE_MASK & (a[-1])); }
-  int alloc_stash(f4vec a) { return (a[-1]) >> STASH_SHIFT; }
-  f4vec allocate(int alloc_sz)
-  {
-    if (alloc_sz == 0) return nullptr;
-    int s = find_alloc_size(alloc_sz);
-    int nextlen = doublesize[s];
-    nallocs[s]++;
-    current[s]++;
-    if (highwater[s] < current[s]) highwater[s] = current[s];
-    int *result = newarray_atomic(int, nextlen);
-    result[0] = (s << STASH_SHIFT) | alloc_sz;
-    return (result + 1);
-  }
-
-  void deallocate(f4vec &a)
-  {
-    if (a == nullptr) return;
-    int s = alloc_stash(a);
-    ndeallocs[s]++;
-    current[s]--;
-    freemem(a - 1);
-    a = nullptr;
-  }
-
-  void clear(f4vec &a)
-  {
-    // Don't change the amount allocated, but set the size back to 0.
-    // (and zero out the array (or place garbage in there...)
-    int sz = size(a);
-    for (int i = sz; i > 0; i--) a[i] = 0x03030303;
-    a[-1] -= sz;
-  }
-
-  void reallocate(f4vec &a, int newsize)
-  {
-    nreallocs++;
-    f4vec newa = allocate(newsize);
-    int oldsize = size(a);
-    int *oldp = a;
-    int *p = newa;
-    for (int i = oldsize; i > 0; i--) *p++ = *oldp++;
-    newa[-1] += (newsize - oldsize);
-    deallocate(a);
-    a = p;
-  }
-
-  void resize(f4vec &a, int newsize)
-  {
-    // grow 'a' to have size 'newsize'.
-    // If the size becomes too large for a to handle
-    // then reallocation is done.
-    int s = alloc_stash(a);
-    if (doublesize[s] < newsize)
-      reallocate(a, newsize);
-    else
-      a[-1] += (newsize - size(a));
-  }
-
-  void grow(f4vec &a, int ntoadd) { resize(a, size(a) + ntoadd); }
-  void show();
-};
-
-class F4Mem : public our_new_delete
-{
-  long monom_alloc;
-  long monom_total;  // number of ints total asked for
-  long monom_freed;  // number of ints deallocated
-  long monom_dealloc;
-  long monom_highwater;
-  long monom_current;
-
- public:
-  F4Vec components;
-  F4Vec coefficients;
-
-  F4Mem();
-
-  monomial_word *allocate_monomial_array(int len)
-  {
-    if (len == 0) return nullptr;
-    monom_alloc++;
-    monom_total += len;
-    monom_current += len;
-    if (monom_highwater < monom_current) monom_highwater = monom_current;
-    monomial_word *result = newarray_atomic(monomial_word, len);
-    return result;
-  }
-
-  void deallocate_monomial_array(monomial_word *&a, int len)
-  {
-    if (len == 0) return;
-    monom_dealloc++;
-    monom_freed += len;
-    monom_current -= len;
-    freemem(a);
-    a = nullptr;
-  }
-
-  void show();
-};
-
-#endif
-
-// Local Variables:
-// compile-command: "make -C $M2BUILDDIR/Macaulay2/e "
-// indent-tabs-mode: nil
-// End:
diff --git a/M2/Macaulay2/e/f4/f4-monlookup.cpp b/M2/Macaulay2/e/f4/f4-monlookup.cpp
index 6d4a4cab824..c0d1874d6af 100644
--- a/M2/Macaulay2/e/f4/f4-monlookup.cpp
+++ b/M2/Macaulay2/e/f4/f4-monlookup.cpp
@@ -4,12 +4,9 @@
 #include "buffer.hpp"                // for buffer
 #include "engine-exports.h"          // for newline
 #include "f4/varpower-monomial.hpp"  // for varpower_word, const_varpower_mo...
-#include "mem.hpp"                   // for stash
-#include "style.hpp"                 // for INTSIZE
 #include "text-io.hpp"               // for emit, emit_line
 
 #include <cassert>                   // for assert
-#include <gc/gc_allocator.h>         // for gc_allocator
 #include <cstdint>                   // for int32_t
 #include <vector>                    // for vector, vector<>::iterator
 #include <algorithm>
@@ -20,7 +17,7 @@ F4MonomialLookupTableT<Key>::new_mi_node(varpower_word v,
                                          varpower_word e,
                                          mi_node *d)
 {
-  mi_node *p = reinterpret_cast<mi_node *>(mi_stash->new_elem());
+  mi_node *p = new mi_node;
   p->var = v;
   p->exp = e;
   p->left = nullptr;
@@ -37,7 +34,7 @@ F4MonomialLookupTableT<Key>::new_mi_node(varpower_word v,
                                          varpower_word e,
                                          Key k)
 {
-  mi_node *p = reinterpret_cast<mi_node *>(mi_stash->new_elem());
+  mi_node *p = new mi_node;
   p->var = v;
   p->exp = e;
   p->left = nullptr;
@@ -57,32 +54,23 @@ void F4MonomialLookupTableT<Key>::delete_mi_node(mi_node *p)
     {
       if (p->header != p) delete_mi_node(p->down());
     }
-  mi_stash->delete_elem(p);
+  delete p;
 }
 
 template <typename Key>
-F4MonomialLookupTableT<Key>::F4MonomialLookupTableT(int nvars, stash *mi_stash0)
+F4MonomialLookupTableT<Key>::F4MonomialLookupTableT(int nvars)
 {
   count = 0;
-  mi_stash = mi_stash0;
-  if (mi_stash == nullptr)
-    {
-      count = 1;
-      mi_stash = new stash("mi_node", sizeof(mi_node));
-    }
-
   size_of_exp = nvars;
-  exp0 = newarray_atomic_clear(ntuple_word, size_of_exp);
+  exp0 = new ntuple_word[size_of_exp];
+  std::fill(exp0, exp0 + size_of_exp, 0);
 }
 
 template <typename Key>
 F4MonomialLookupTableT<Key>::~F4MonomialLookupTableT()
 {
-  for (typename VECTOR(mi_node *)::const_iterator i = mis.begin();
-       i != mis.end();
-       i++)
-    delete_mi_node(*i);
-  if ((count % 2) == 1) delete mi_stash;
+  for (auto& i : mis) delete_mi_node(i);
+  delete [] exp0;
 }
 
 template <typename Key>
@@ -90,7 +78,7 @@ void F4MonomialLookupTableT<Key>::insert1(mi_node *&top,
                                           const_varpower_monomial b,
                                           Key k)
 {
-  count += 2;
+  count++;
   mi_node **p = &top, *up = nullptr;
   bool one_element = true;
 
@@ -204,8 +192,8 @@ bool F4MonomialLookupTableT<Key>::find_one_divisor1(mi_node *mi,
 template <typename Key>
 void F4MonomialLookupTableT<Key>::find_all_divisors1(mi_node *mi,
                                                      const_ntuple_monomial exp,
-                                                     VECTOR(Key) &
-                                                         result_k) const
+                                                     std::vector<Key>& result_k
+                                                     ) const
 {
   mi_node *p = mi;
 
@@ -244,13 +232,14 @@ void F4MonomialLookupTableT<Key>::update_expvector(
   if (size_of_exp <= nvars)
     {
       // Increase size of exponent vector
-      freemem(exp0);
+      delete [] exp0;
       if (nvars > 2 * size_of_exp)
         size_of_exp = nvars;
       else
         size_of_exp *= 2;
 
-      exp0 = newarray_atomic_clear(ntuple_word, size_of_exp);
+      exp0 = new ntuple_word[size_of_exp];
+      for (auto i=0; i<size_of_exp; ++i) exp0[i] = 0;
     }
 
   int nparts = static_cast<int>(*m++);
@@ -291,7 +280,7 @@ template <typename Key>
 void F4MonomialLookupTableT<Key>::find_all_divisors_vp(
     long comp,
     const_varpower_monomial m,
-    VECTOR(Key) & result_k) const
+    std::vector<Key> & result_k) const
 {
   if (comp >= mis.size()) return;
   mi_node *mi = mis[comp];
@@ -322,7 +311,7 @@ template <typename Key>
 void F4MonomialLookupTableT<Key>::find_all_divisors_packed(
     const MonomialInfo *M,
     const_packed_monomial m,
-    VECTOR(Key) & result_k) const
+    std::vector<Key> & result_k) const
 {
   long comp = M->get_component(m);
   if (comp >= mis.size()) return;
@@ -337,10 +326,11 @@ void F4MonomialLookupTableT<Key>::insert_minimal_vp(long comp,
                                                     const_varpower_monomial m,
                                                     Key k)
 {
-  if (comp >= mis.size())
-    {
-      for (long j = comp - mis.size(); j >= 0; j--) mis.push_back(nullptr);
-    }
+  while (comp >= mis.size()) mis.push_back(nullptr);
+  // if (comp >= mis.size())
+  //   {
+  //     for (long j = comp - mis.size(); j >= 0; j--) mis.push_back(nullptr);
+  //   }
   insert1(mis[comp], m, k);
 }
 
@@ -446,10 +436,8 @@ void F4MonomialLookupTableT<Key>::debug_out(int disp) const
   nnodes = 0;
   nleaves = 0;
   ndepth = 0;
-  for (typename VECTOR(mi_node *)::const_iterator i = mis.begin();
-       i != mis.end();
-       i++)
-    if (*i != nullptr) do_tree(*i, 0, 0, disp);
+  for (auto& i : mis)
+    if (i != nullptr) do_tree(i, 0, 0, disp);
   buffer o;
   o << "list nodes     = " << nlists << newline;
   o << "internal nodes = " << nnodes << newline;
@@ -506,26 +494,22 @@ template <typename Key>
 void F4MonomialLookupTableT<Key>::debug_check() const
 {
   int nfound = 0;
-  for (typename VECTOR(mi_node *)::const_iterator i = mis.begin();
-       i != mis.end();
-       i++)
+  for (auto& i : mis)
     {
-      if (*i != nullptr) nfound += debug_check(*i, nullptr);
+      if (i != nullptr) nfound += debug_check(i, nullptr);
     }
-  assert(count / 2 == nfound);
+  assert(count == nfound);
 }
 
 template <typename Key>
 void F4MonomialLookupTableT<Key>::text_out(buffer &o) const
 {
   o << "F4MonomialLookupTableT (";
-  o << count / 2 << " entries)\n";
+  o << count << " entries)\n";
   int a = 0;
-  for (typename VECTOR(mi_node *)::const_iterator i = mis.begin();
-       i != mis.end();
-       i++)
+  for (auto& i : mis)
     {
-      for (mi_node *p = *i; p != nullptr; p = next(p))
+      for (mi_node *p = i; p != nullptr; p = next(p))
         {
           if ((++a) % 15 == 0) o << newline;
           o << p->key() << "  ";
@@ -533,9 +517,8 @@ void F4MonomialLookupTableT<Key>::text_out(buffer &o) const
     }
 }
 
-void minimalize_varpower_monomials(const VECTOR(varpower_monomial) & elems,
-                                   VECTOR(int) & result_minimals,
-                                   stash *mi_stash)
+void minimalize_varpower_monomials(const std::vector<varpower_monomial> & elems,
+                                   std::vector<int> & result_minimals)
 {
   std::vector<std::pair<int, int>> degs_and_indices;
   int count = 0;
@@ -547,7 +530,7 @@ void minimalize_varpower_monomials(const VECTOR(varpower_monomial) & elems,
     }
   std::stable_sort(degs_and_indices.begin(), degs_and_indices.end());
 
-  F4MonomialLookupTableT<int> M(10, mi_stash); // 10 is simply a suggested value
+  F4MonomialLookupTableT<int> M(10); // 10 is simply a suggested value
   for (auto p : degs_and_indices)
     {
       int k; // unused: we only care if there is a divisor, not which index it has
diff --git a/M2/Macaulay2/e/f4/f4-monlookup.hpp b/M2/Macaulay2/e/f4/f4-monlookup.hpp
index 00a665377be..549e1db68a6 100644
--- a/M2/Macaulay2/e/f4/f4-monlookup.hpp
+++ b/M2/Macaulay2/e/f4/f4-monlookup.hpp
@@ -6,15 +6,13 @@
 #include "f4/moninfo.hpp"            // for MonomialInfo (ptr only), const_p...
 #include "f4/ntuple-monomial.hpp"    // for const_ntuple_monomial, ntuple_word
 #include "f4/varpower-monomial.hpp"  // for const_varpower_monomial, varpowe...
-#include "newdelete.hpp"             // for VECTOR, our_new_delete
 
 class buffer;
-class stash;
 
 template <typename Key>
-class F4MonomialLookupTableT : public our_new_delete
+class F4MonomialLookupTableT
 {
-  struct mi_node : public our_new_delete  // monomial ideal internal node ///
+  struct mi_node  // monomial ideal internal node ///
   {
     varpower_word var;
     varpower_word exp;
@@ -40,8 +38,7 @@ class F4MonomialLookupTableT : public our_new_delete
     }
   };
 
-  stash *mi_stash;
-  VECTOR(mi_node *) mis;
+  std::vector<mi_node *> mis;
   int count;
 
   int size_of_exp;  // in ints, size of exp0
@@ -61,12 +58,12 @@ class F4MonomialLookupTableT : public our_new_delete
 
   void find_all_divisors1(mi_node *mi,
                           const_ntuple_monomial exp,
-                          VECTOR(Key) & result_k) const;
+                          std::vector<Key>& result_k) const;
 
   void insert1(mi_node *&p, const_varpower_monomial m, Key k);
 
  public:
-  F4MonomialLookupTableT(int nvars, stash *mi_stash = nullptr);
+  F4MonomialLookupTableT(int nvars);
   ~F4MonomialLookupTableT();
 
   //  // Should we write these two routines?
@@ -98,11 +95,11 @@ class F4MonomialLookupTableT : public our_new_delete
 
   void find_all_divisors_vp(long comp,
                             const_varpower_monomial m,
-                            VECTOR(Key) & result_k) const;
+                            std::vector<Key> & result_k) const;
 
   void find_all_divisors_packed(const MonomialInfo *M,
                                 const_packed_monomial m,
-                                VECTOR(Key) & result_k) const;
+                                std::vector<Key> & result_k) const;
   // Search. Return a vector of all keys corresponding to
   // monomials which divide m.
 
@@ -121,9 +118,8 @@ class F4MonomialLookupTableT : public our_new_delete
   void debug_check() const;
 };
 
-void minimalize_varpower_monomials(const VECTOR(varpower_monomial) & elems,
-                                   VECTOR(int) & result_minimals,
-                                   stash *mi_stash = nullptr);
+void minimalize_varpower_monomials(const std::vector<varpower_monomial> & elems,
+                                   std::vector<int> & result_minimals);
 
 #endif
 
diff --git a/M2/Macaulay2/e/f4/f4-spairs.cpp b/M2/Macaulay2/e/f4/f4-spairs.cpp
index 64dfa6cce4d..7ee53bfefe4 100644
--- a/M2/Macaulay2/e/f4/f4-spairs.cpp
+++ b/M2/Macaulay2/e/f4/f4-spairs.cpp
@@ -11,15 +11,25 @@
 #include <algorithm>          // for stable_sort
 #include <vector>             // for vector, vector<>::iterator
 
+#if defined(WITH_TBB)
+F4SPairSet::F4SPairSet(const MonomialInfo *M0, const gb_array &gb0, mtbb::task_arena& scheduler)
+#else
 F4SPairSet::F4SPairSet(const MonomialInfo *M0, const gb_array &gb0)
-    : M(M0), gb(gb0), heap(nullptr), this_set(nullptr), nsaved_unneeded(0)
+#endif
+  : M(M0),
+    gb(gb0),
+    mSPairCompare(mSPairs),
+    mSPairQueue(mSPairCompare),
+    nsaved_unneeded(0),
+    mMinimizePairsSeconds(0)
+#if defined(WITH_TBB)
+  , mScheduler(scheduler)
+#endif
 {
   max_varpower_size = 2 * M->n_vars() + 1;
 
-  spair *used_to_determine_size = nullptr;
-  size_t spair_size =
-      sizeofspair(used_to_determine_size, M->max_monomial_size());
-  spair_stash = new stash("F4 spairs", spair_size);
+  //spair *used_to_determine_size = nullptr;
+  //mSPairSizeInBytes = sizeofspair(used_to_determine_size, M->max_monomial_size());
 }
 
 F4SPairSet::~F4SPairSet()
@@ -27,20 +37,6 @@ F4SPairSet::~F4SPairSet()
   // Deleting the stash deletes all memory used here
   // PS, VP are deleted automatically.
   M = nullptr;
-  heap = nullptr;
-  this_set = nullptr;
-  delete spair_stash;
-}
-
-spair *F4SPairSet::make_spair(spair_type type, int deg, int i, int j)
-{
-  spair *result = reinterpret_cast<spair *>(spair_stash->new_elem());
-  result->next = nullptr;
-  result->type = type;
-  result->deg = deg;
-  result->i = i;
-  result->j = j;
-  return result;
 }
 
 void F4SPairSet::insert_spair(pre_spair *p, int me)
@@ -49,27 +45,43 @@ void F4SPairSet::insert_spair(pre_spair *p, int me)
   int deg = p->deg1 + gb[me]->deg;
   // int me_component = M->get_component(gb[me]->f.monoms);
 
-  spair *result = make_spair(F4_SPAIR_SPAIR, deg, me, j);
-
-  M->from_varpower_monomial(p->quot, 0, result->lcm);
-  M->unchecked_mult(result->lcm, gb[me]->f.monoms, result->lcm);
-
-  result->next = heap;
-  heap = result;
+  spair result {SPairType::SPair, deg, me, j, nullptr};
+  
+  auto allocRange = mSPairLCMs.allocateArray<monomial_word>(M->max_monomial_size());
+  result.lcm = allocRange.first;
+  
+  M->from_varpower_monomial(p->quot, 0, result.lcm);
+  M->unchecked_mult(result.lcm, gb[me]->f.monoms, result.lcm);
+
+  mSPairLCMs.shrinkLastAllocate(allocRange.first,
+                                allocRange.second,
+                                allocRange.first + M->monomial_size(result.lcm));
+
+  auto sPairIndex = mSPairs.size();
+  mSPairs.push_back(result);  
+  mSPairQueue.push(sPairIndex);
+  
 }
 
-void F4SPairSet::delete_spair(spair *p) { spair_stash->delete_elem(p); }
+void F4SPairSet::delete_spair(spair *p) { delete p; }
 void F4SPairSet::insert_generator(int deg, packed_monomial lcm, int col)
 {
-  spair *p = make_spair(F4_SPAIR_GEN, deg, col, -1);
-  M->copy(lcm, p->lcm);
-  p->next = heap;
-  heap = p;
+  spair result {SPairType::Generator,deg,col,-1,nullptr};
+
+  auto allocRange = mSPairLCMs.allocateArray<monomial_word>(M->monomial_size(lcm));
+  result.lcm = allocRange.first;
+  
+  M->copy(lcm, result.lcm);
+
+  auto sPairIndex = mSPairs.size();
+  mSPairs.push_back(result);  
+  mSPairQueue.push(sPairIndex);
 }
 
 bool F4SPairSet::pair_not_needed(spair *p, gbelem *m)
 {
-  if (p->type != F4_SPAIR_SPAIR && p->type != F4_SPAIR_RING) return false;
+  // if (p->type != SPairType::SPair && p->type != SPairType::Ring) return false;
+  if (p->type != SPairType::SPair) return false;
   if (M->get_component(p->lcm) != M->get_component(m->f.monoms)) return false;
   return M->unnecessary(
       m->f.monoms, gb[p->i]->f.monoms, gb[p->j]->f.monoms, p->lcm);
@@ -81,98 +93,85 @@ int F4SPairSet::remove_unneeded_pairs()
   // and do so.  Return the number removed.
 
   // MES: Check the ones in this_set? Probably not needed...
-  spair head;
-  spair *p = &head;
+
+  if (gb.size() == 0) return 0;
+  
   gbelem *m = gb[gb.size() - 1];
-  int nremoved = 0;
-
-  head.next = heap;
-  while (p->next != nullptr)
-    if (pair_not_needed(p->next, m))
-      {
-        nremoved++;
-        spair *tmp = p->next;
-        p->next = tmp->next;
-        tmp->next = nullptr;
-        delete_spair(tmp);
-      }
-    else
-      p = p->next;
-  heap = head.next;
-  return nremoved;
+  //long nremoved = 0;
+
+#if defined(WITH_TBB)
+  mScheduler.execute([&] {
+    mtbb::parallel_for(mtbb::blocked_range<int>{0, INTSIZE(mSPairs)},
+                       [&](const mtbb::blocked_range<int>& r)
+                       {
+                         for (auto i = r.begin(); i != r.end(); ++i)
+                           {
+                             if (pair_not_needed(&mSPairs[i],m))
+                               {
+                                 mSPairs[i].type = SPairType::Retired;
+                               }
+                           }
+                       });
+  });
+#else  
+  for (auto& p : mSPairs)
+  {
+    if (pair_not_needed(&p,m))
+    {
+      p.type = SPairType::Retired;
+      //++nremoved;
+    }
+  }
+#endif  
+  return 0;
+  //return nremoved;
+  
 }
 
-int F4SPairSet::determine_next_degree(int &result_number)
+std::pair<bool,int> F4SPairSet::setThisDegree()
 {
-  spair *p;
-  int nextdeg;
-  int len = 1;
-  if (heap == nullptr)
+  if (mSPairQueue.empty()) return {false, 0};
+
+  auto queueTop = mSPairQueue.top();
+  while (mSPairs[queueTop].type == SPairType::Retired)
     {
-      result_number = 0;
-      return 0;
+      mSPairQueue.pop();
+      if (mSPairQueue.empty()) return {false, 0};
+      queueTop = mSPairQueue.top();
     }
-  nextdeg = heap->deg;
-  for (p = heap->next; p != nullptr; p = p->next)
-    if (p->deg > nextdeg)
-      continue;
-    else if (p->deg < nextdeg)
-      {
-        len = 1;
-        nextdeg = p->deg;
-      }
-    else
-      len++;
-  result_number = len;
-  return nextdeg;
-}
 
-int F4SPairSet::prepare_next_degree(int max, int &result_number)
-// Returns the (sugar) degree being done next, and collects all (or at
-// most 'max', if max>0) spairs in this lowest degree.
-// Returns the degree, sets result_number.
-{
-  this_set = nullptr;
-  int result_degree = determine_next_degree(result_number);
-  if (result_number == 0) return 0;
-  if (max > 0 && max < result_number) result_number = max;
-  int len = result_number;
-  spair head;
-  spair *p;
-  head.next = heap;
-  p = &head;
-  while (p->next != nullptr)
-    if (p->next->deg != result_degree)
-      p = p->next;
-    else
-      {
-        spair *tmp = p->next;
-        p->next = tmp->next;
-        tmp->next = this_set;
-        this_set = tmp;
-        len--;
-        if (len == 0) break;
-      }
-  heap = head.next;
-  return result_degree;
+  mThisDegree = mSPairs[queueTop].deg;
+  return {true,mThisDegree};
+
 }
 
-spair *F4SPairSet::get_next_pair()
+//spair *F4SPairSet::get_next_pair()
+std::pair<bool,spair> F4SPairSet::get_next_pair()
 // get the next pair in this degree (the one 'prepare_next_degree' set up')
-// returns 0 if at the end
 {
-  spair *result;
-  if (!this_set) return nullptr;
+  if (mSPairQueue.empty()) return {false, {}};
+  auto result = mSPairQueue.top();
+  if (mSPairs[result].deg != mThisDegree) return {false, {} };
+  mSPairQueue.pop();
+  return {true,mSPairs[result]};
+}
 
-  result = this_set;
-  this_set = this_set->next;
-  result->next = nullptr;
-  return result;
+void F4SPairSet::discardSPairsInCurrentDegree()
+{
+  while (not mSPairQueue.empty())
+    {
+      auto result = mSPairQueue.top();
+      if (mSPairs[result].deg != mThisDegree) return;
+      mSPairs[result].type = SPairType::Retired;
+      mSPairQueue.pop();
+    }
 }
 
 int F4SPairSet::find_new_pairs(bool remove_disjoints)
 // returns the number of new pairs found
 {
+  // this is used for "late" removal of spairs -- will need to be reworked
+  //   in the new priority_queue approach
   nsaved_unneeded += remove_unneeded_pairs();
   int len = construct_pairs(remove_disjoints);
   return len;
@@ -181,7 +180,7 @@ int F4SPairSet::find_new_pairs(bool remove_disjoints)
 void F4SPairSet::display_spair(spair *p)
 // A debugging routine which displays an spair
 {
-  if (p->type == F4_SPAIR_SPAIR)
+  if (p->type == SPairType::SPair)
     {
       fprintf(stderr, "[%d %d deg %d lcm ", p->i, p->j, p->deg);
       M->show(p->lcm);
@@ -196,7 +195,8 @@ void F4SPairSet::display_spair(spair *p)
 void F4SPairSet::display()
 // A debugging routine which displays the spairs in the set
 {
-  fprintf(stderr, "spair set\n");
+  /*
+    fprintf(stderr, "spair set\n");
   for (spair *p = heap; p != nullptr; p = p->next)
     {
       fprintf(stderr, "   ");
@@ -208,6 +208,7 @@ void F4SPairSet::display()
       fprintf(stderr, "   ");
       display_spair(p);
     }
+  */
 }
 
 ////////////////////////////////
@@ -222,7 +223,7 @@ pre_spair *F4SPairSet::create_pre_spair(int j)
   pre_spair *result = PS.allocate();
   result->quot = VP.reserve(max_varpower_size);
   result->j = j;
-  result->type = F4_SPAIR_SPAIR;
+  result->type = SPairType::SPair;;
   M->quotient_as_vp(gb[j]->f.monoms,
                     gb[gb.size() - 1]->f.monoms,
                     result->quot,
@@ -233,7 +234,7 @@ pre_spair *F4SPairSet::create_pre_spair(int j)
   return result;
 }
 
-void insert_pre_spair(VECTOR(VECTOR(pre_spair *)) & bins, pre_spair *p)
+void insert_pre_spair(std::vector<std::vector<pre_spair *>> & bins, pre_spair *p)
 {
   int d = p->deg1;
   if (d >= bins.size()) bins.resize(d + 1);
@@ -251,10 +252,10 @@ int F4SPairSet::construct_pairs(bool remove_disjoints)
   gbelem *me = gb[gb.size() - 1];
   int me_component = static_cast<int>(M->get_component(me->f.monoms));
 
-  typedef VECTOR(pre_spair *) spairs;
-
-  VECTOR(VECTOR(pre_spair *)) bins;
+  std::vector<std::vector<pre_spair *>> bins;
 
+  mtbb::tick_count t0 {mtbb::tick_count::now()};
+  
   // Loop through each element of gb, and create the pre_spair
   for (int i = 0; i < gb.size() - 1; i++)
     {
@@ -264,6 +265,9 @@ int F4SPairSet::construct_pairs(bool remove_disjoints)
       insert_pre_spair(bins, p);
     }
 
+  mtbb::tick_count t1 {mtbb::tick_count::now()};
+  mPrePairsSeconds += (t1-t0).seconds();  
+
   ////////////////////////////
   // Now minimalize the set //
   ////////////////////////////
@@ -279,9 +283,9 @@ int F4SPairSet::construct_pairs(bool remove_disjoints)
       std::stable_sort(bins[i].begin(), bins[i].end(), C);
 
       // Loop through each degree and potentially insert...
-      spairs::iterator first = bins[i].begin();
-      spairs::iterator next = first;
-      spairs::iterator end = bins[i].end();
+      auto first = bins[i].begin();
+      auto next = first;
+      auto end = bins[i].end();
       for (; first != end; first = next)
         {
           next = first + 1;
@@ -310,10 +314,82 @@ int F4SPairSet::construct_pairs(bool remove_disjoints)
         }
     }
   delete montab;
+  mtbb::tick_count t2 {mtbb::tick_count::now()};
+  mMinimizePairsSeconds += (t2-t1).seconds();
 
   return n_new_pairs;
 }
 
+#if 0
+// testing mathic and mathicgb routines...
+class TestPairQueueConfiguration
+{
+private:
+  // What should be here?
+  
+public:
+  TestPairQueueConfiguration(const gb_array& gb,
+                     XXX
+                     );
+  using PairData = MonomialInfo::OrderedMonomial;
+  void computePairData(
+                       size_t col,
+                       size_t row,
+                       PairData & m, // allocated space?
+                       ) const;
+
+  using CompareResult = bool;
+  bool compare(
+               size_t colA,
+               size_t rowA,
+               MonomialInfo::ConstOrderedMonomial a,
+               size_t colB,
+               size_t rowB,
+               MonomialInfo::ConstOrderedMonomial b) const
+  {
+    // What to change this test code to?
+    const auto cmp = orderMonoid().compare(*a, *b);
+    if (cmp == GT)
+      return true;
+    if (cmp == LT)
+      return false;
+      
+    const bool aRetired = mBasis.retired(rowA) || mBasis.retired(colA);
+    const bool bRetired = mBasis.retired(rowB) || mBasis.retired(colB);
+    if (aRetired || bRetired)
+      return !bRetired;
+      
+    if (mPreferSparseSPairs) {
+      const auto termCountA =
+        mBasis.basisElement(colA).termCount() +
+        mBasis.basisElement(rowA).termCount();
+      const auto termCountB =
+        mBasis.basisElement(colB).termCount() +
+        mBasis.basisElement(rowB).termCount();
+      if (termCountA > termCountB)
+        return true;
+      if (termCountA < termCountB)
+        return false;
+    }
+    return colA + rowA > colB + rowB;
+  }
+
+  bool cmpLessThan(bool v) const {return v;}
+};
+
+class TestSPairs
+{
+private:
+  mathic::PairQueue<TestPairQueueConfiguration> mPairQueue;
+
+public:
+  TestSPairs(gb_poly& currentGroebnerBasis);
+
+  ~TestSPairs() {} // anything here?
+
+};
+#endif
+
 // Local Variables:
 //  compile-command: "make -C $M2BUILDDIR/Macaulay2/e "
 //  indent-tabs-mode: nil
diff --git a/M2/Macaulay2/e/f4/f4-spairs.hpp b/M2/Macaulay2/e/f4/f4-spairs.hpp
index 1855828ec9d..5104edc6e79 100644
--- a/M2/Macaulay2/e/f4/f4-spairs.hpp
+++ b/M2/Macaulay2/e/f4/f4-spairs.hpp
@@ -1,21 +1,18 @@
 // Copyright 2004-2021 Michael E. Stillman
 
-#ifndef __f4spairs_h_
-#define __f4spairs_h_
+#pragma once
 
+#include <queue>                     // for std::priority_queue
+#include "MemoryBlock.hpp"           // for MemoryBlock
 #include "f4/f4-types.hpp"           // for spair (ptr only), gb_array, pre_...
 #include "f4/memblock.hpp"           // for F4MemoryBlock
 #include "f4/moninfo.hpp"            // for MonomialInfo (ptr only), packed_...
 #include "f4/varpower-monomial.hpp"  // for varpower_word
-#include "newdelete.hpp"             // for our_new_delete
-class stash;
 
-class F4SPairSet : public our_new_delete
+class F4SPairSet
 {
  private:
-  int determine_next_degree(int &result_number);
-
-  spair *make_spair(spair_type type,
+  spair *make_spair(SPairType type,
                     int deg,
                     int i,
                     int j);  // CAUTION: lcm is allocated correctly, but is
@@ -31,8 +28,12 @@ class F4SPairSet : public our_new_delete
   int construct_pairs(bool remove_disjoints);
 
  public:
-  F4SPairSet(const MonomialInfo *MI0, const gb_array &gb0);
-
+#if defined(WITH_TBB)
+  F4SPairSet(const MonomialInfo *MI0, const gb_array &gb0,mtbb::task_arena& scheduler);
+#else
+  F4SPairSet(const MonomialInfo *MI0, const gb_array &gb0);  
+#endif
+  
   ~F4SPairSet();
 
   void insert_generator(int deg, packed_monomial lcm, int column);
@@ -45,16 +46,20 @@ class F4SPairSet : public our_new_delete
   int find_new_pairs(bool remove_disjoints);
   // returns the number of new pairs found, using the last element on this list
 
-  int prepare_next_degree(int max, int &result_number);
+  std::pair<bool,int> setThisDegree();
+  
+  // int prepare_next_degree(int max, int &result_number);
   // Returns the (sugar) degree being done next, and collects all (or at
   // most 'max', if max>0) spairs in this lowest degree.
   // Returns the degree, sets result_number.
   // These spairs are not sorted in any way.  Should they be?
 
-  spair *get_next_pair();
+  std::pair<bool,spair> get_next_pair();
   // get the next pair in this degree (the one 'prepare_next_degree' set up')
-  // returns 0 if at the end
 
+  // discard all of the spairs in the current set degree
+  void discardSPairsInCurrentDegree();
+  
   void display_spair(spair *p);
   // A debugging routine which displays an spair
 
@@ -65,22 +70,37 @@ class F4SPairSet : public our_new_delete
   // Returns how many pairs were created, then later removed due to
   // spair criteria.
 
+  size_t numberOfSPairs() const { return mSPairs.size(); }
+
+  double secondsToMinimizePairs() const { return mMinimizePairsSeconds; }
+  double secondsToCreatePrePairs() const { return mPrePairsSeconds; }
  private:
   F4MemoryBlock<pre_spair> PS;      // passed to constructor routine
   F4MemoryBlock<varpower_word> VP;  // used for constructing new pairs
+  MemoryBlock mSPairLCMs;  // used for spair lcms  
+  
   int max_varpower_size;
 
   const MonomialInfo *M;
   const gb_array &gb;
-  spair *heap;  // list of pairs
-  spair *this_set;
-  stash *spair_stash;
+
+  std::vector<spair> mSPairs;
+  SPairCompare mSPairCompare;
+  std::priority_queue<size_t,std::vector<size_t>,SPairCompare> mSPairQueue;
+
+  long mThisDegree;
 
   // stats
   long nsaved_unneeded;
-};
+  double mMinimizePairsSeconds;
+  double mPrePairsSeconds;
 
+#if defined (WITH_TBB)
+  mtbb::task_arena& mScheduler;
 #endif
+  
+};
+
 
 // Local Variables:
 //  compile-command: "make -C $M2BUILDDIR/Macaulay2/e "
diff --git a/M2/Macaulay2/e/f4/f4-types.hpp b/M2/Macaulay2/e/f4/f4-types.hpp
index aa923ebbdea..aa6acb38326 100644
--- a/M2/Macaulay2/e/f4/f4-types.hpp
+++ b/M2/Macaulay2/e/f4/f4-types.hpp
@@ -4,6 +4,7 @@
 #define _F4types_h_
 
 
+#include <climits>                   // for INT_MIN
 #include "VectorArithmetic.hpp"      // for ElementArray
 #include "f4/f4-monlookup.hpp"       // for F4MonomialLookupTableT
 #include "f4/moninfo.hpp"            // for MonomialInfo, monomial_word, pac...
@@ -24,41 +25,66 @@ enum gbelem_type {
 };
 
 enum spair_type {
-  F4_SPAIR_SPAIR,
-  F4_SPAIR_GCD_ZZ,
-  F4_SPAIR_RING,
-  F4_SPAIR_SKEW,
-  F4_SPAIR_GEN,
-  F4_SPAIR_ELEM
+  F4_SPAIR_SPAIR,        // arising from an honest spair
+  F4_SPAIR_GCD_ZZ,       // for gbs over the integers
+  F4_SPAIR_RING,         // an spair between a generator and a gen of the defining ideal
+  F4_SPAIR_SKEW,         // from exterior variables times a monomial
+  F4_SPAIR_GEN,          // a generator of the defining ideal
+  F4_SPAIR_ELEM          // 
 };
 
-struct GBF4Polynomial : public our_new_delete
+enum class SPairType {
+  SPair,
+  Generator,
+  Retired
+  // later we would also like GCDZZ, Ring, Skew to handle those cases as well
+};
+
+struct GBF4Polynomial 
 {
   int len;
   ElementArray coeffs;
   monomial_word *monoms;  // This is all of the monomials written contiguously
 };
 
-struct pre_spair : public our_new_delete
+struct pre_spair 
 {
-  enum spair_type type;
+  //enum spair_type type;
+  SPairType type;
   int deg1;  // simple degree of quot
   varpower_monomial quot;
   int j;
   bool are_disjoint;
 };
 
-struct spair : public our_new_delete
+// old spair type (linked list-style container)
+//struct spair
+//{
+//  spair *next;
+//  spair_type type;
+//  int deg; /* sugar degree of this spair */
+//  int i;
+//  int j;
+//  monomial_word lcm[1];  // packed_monomial
+//};
+
+struct spair
 {
-  spair *next;
-  spair_type type;
+public:
+  // spair *next;
+  SPairType type;
   int deg; /* sugar degree of this spair */
   int i;
   int j;
-  monomial_word lcm[1];  // packed_monomial
+  // monomial_word lcm[1];  // packed_monomial
+  monomial_word* lcm;  // pointer to a monomial space
+  
+  spair() : type(SPairType::Retired),deg(INT_MIN),i(-1),j(-1),lcm(nullptr) {}
+  spair(SPairType t, int deg0, int i0, int j0, monomial_word* lcm0) :
+    type(t), deg(deg0), i(i0), j(j0), lcm(lcm0) {}
 };
 
-struct gbelem : public our_new_delete
+struct gbelem
 {
   GBF4Polynomial f;
   int deg;
@@ -66,16 +92,17 @@ struct gbelem : public our_new_delete
   gbelem_type minlevel;
 };
 
-typedef VECTOR(gbelem *) gb_array;
+// typedef VECTOR(gbelem *) gb_array;
+typedef std::vector<gbelem *> gb_array;
 
-struct sparse_row : public our_new_delete
+struct sparse_row
 {
   int len;
   ElementArray coeffs;
   int *comps; // of length len, allocated in a memory block.
 };
 
-struct row_elem : public our_new_delete
+struct row_elem
 {
   // Header information
   packed_monomial monom; // pointer, allocated monomial in a memory block
@@ -84,20 +111,22 @@ struct row_elem : public our_new_delete
   // The polynomial itself
   int len;
   ElementArray coeffs;
-  int *comps; // of length len, allocated in a memory block.
+  int *comps; // of length len, no longer allocated in a memory block, but with new[]
 };
 
-struct column_elem : public our_new_delete
+struct column_elem
 {
   packed_monomial monom; // pointer, allocated monomial in a memory block
   int head;  // which row is being used as a pivot for this column.
              // -1 means none, -2 means not set yet
 };
 
-struct coefficient_matrix : public our_new_delete
+struct coefficient_matrix
 {
-  typedef VECTOR(row_elem) row_array;
-  typedef VECTOR(column_elem) column_array;
+  //typedef VECTOR(row_elem) row_array;
+  //typedef VECTOR(column_elem) column_array;
+  typedef std::vector<row_elem> row_array;
+  typedef std::vector<column_elem> column_array;
 
   row_array rows;
   column_array columns;
@@ -187,6 +216,26 @@ class PreSPairSorter
   ~PreSPairSorter() {}
 };
 
+class SPairCompare
+{
+public:
+  SPairCompare(const std::vector<spair> &spairs) : mSPairs(spairs) { }
+
+public:
+  bool operator()(size_t s, size_t t) const
+  {
+    const spair& a = mSPairs[s];
+    const spair& b = mSPairs[t];
+    if (a.deg > b.deg) return true;
+    if (a.deg < b.deg) return false;
+    if (a.i > b.i) return true;
+    return false;
+  }
+
+private:
+  const std::vector<spair>& mSPairs;
+};
+
 typedef F4MonomialLookupTableT<int32_t> MonomialLookupTable;
 
 #endif
diff --git a/M2/Macaulay2/e/f4/f4.cpp b/M2/Macaulay2/e/f4/f4.cpp
index a29744fd9f7..113b556c6a4 100644
--- a/M2/Macaulay2/e/f4/f4.cpp
+++ b/M2/Macaulay2/e/f4/f4.cpp
@@ -7,7 +7,6 @@
 #include "buffer.hpp"                  // for buffer
 #include "error.h"                     // for error
 #include "f4/f4-m2-interface.hpp"      // for F4toM2Interface
-#include "f4/f4-mem.hpp"               // for F4Mem, F4Vec
 #include "f4/f4-spairs.hpp"            // for F4SPairSet
 #include "f4/f4-types.hpp"             // for row_elem, coefficient_matrix
 #include "f4/hilb-fcn.hpp"             // for HilbertController
@@ -26,11 +25,13 @@
 #include <cstdio>                      // for fprintf, stderr
 #include <algorithm>                   // for stable_sort
 #include <vector>                      // for swap, vector
+#include <atomic>                      // for atomic ints in gauss_reduce
+#include <strstream>
+#include <iostream>
 
 class RingElement;
 
 F4GB::F4GB(const VectorArithmetic* VA,
-           F4Mem *Mem0,
            const MonomialInfo *M0,
            const FreeModule *F0,
            M2_bool collect_syz,
@@ -38,10 +39,11 @@ F4GB::F4GB(const VectorArithmetic* VA,
            M2_arrayint weights0,
            int strategy,
            M2_bool use_max_degree,
-           int max_degree)
+           int max_degree,
+           int numThreads)
     : mVectorArithmetic(VA),
-      M(M0),
-      F(F0),
+      mMonomialInfo(M0),
+      mFreeModule(F0),
       weights(weights0),
       component_degrees(nullptr),  // need to put this in
       n_pairs_computed(0),
@@ -52,31 +54,42 @@ F4GB::F4GB(const VectorArithmetic* VA,
       this_degree(),
       is_ideal(F0->rank() == 1),
       hilbert(nullptr),
-      gens(),
-      gb(),
-      lookup(nullptr),
-      S(nullptr),
+      mGenerators(),
+      mGroebnerBasis(),
+      mLookupTable(mMonomialInfo->n_vars()),
       next_col_to_process(0),
       mat(nullptr),
-      H(M0, 17),
-      B(),
+      mMonomialHashTable(M0, 17),
+      mMonomialMemoryBlock(),
       next_monom(),
-      Mem(Mem0),
       clock_sort_columns(0),
       clock_gauss(0),
-      clock_make_matrix(0)
+      mGaussTime(0),
+      mParallelGaussTime(0),
+      mSerialGaussTime(0),
+      mTailReduceTime(0),
+      mNewSPairTime(0),
+      mInsertGBTime(0),
+      clock_make_matrix(0),
+      mNumThreads(mtbb::numThreads(numThreads)),
+#if defined(WITH_TBB)
+      mScheduler(mNumThreads),
+      mSPairSet(mMonomialInfo, mGroebnerBasis, mScheduler)
+#else
+      mSPairSet(mMonomialInfo, mGroebnerBasis)
+#endif
 {
-  lookup = new MonomialLookupTable(M->n_vars());
-  S = new F4SPairSet(M, gb);
+  //  mLookupTable = new MonomialLookupTable(mMonomialInfo->n_vars());
+  //  mSPairSet = new F4SPairSet(mMonomialInfo, mGroebnerBasis);
   mat = new coefficient_matrix;
 
   // TODO: set status?
-  if (M2_gbTrace >= 2) M->show();
+  if (M2_gbTrace >= 3) mMonomialInfo->show();
 }
 
 void F4GB::set_hilbert_function(const RingElement *hf)
 {
-  hilbert = new HilbertController(F, hf); // TODO MES: GC problem?
+  hilbert = new HilbertController(mFreeModule, hf); // TODO MES: GC problem?
 }
 
 void F4GB::delete_gb_array(gb_array &g)
@@ -92,22 +105,22 @@ void F4GB::delete_gb_array(gb_array &g)
 
 F4GB::~F4GB()
 {
-  delete S;
-  delete lookup;
+  //  delete mSPairSet;
+  // delete mLookupTable;
   delete mat;
 
   // Now delete the gens, gb arrays.
-  delete_gb_array(gens);
-  delete_gb_array(gb);
+  delete_gb_array(mGenerators);
+  delete_gb_array(mGroebnerBasis);
 }
 
 void F4GB::new_generators(int lo, int hi)
 {
   for (int i = lo; i <= hi; i++)
     {
-      gbelem *g = gens[i];
+      gbelem *g = mGenerators[i];
       if (g->f.len == 0) continue;
-      S->insert_generator(g->deg, g->f.monoms, i);
+      mSPairSet.insert_generator(g->deg, g->f.monoms, i);
     }
 }
 
@@ -129,11 +142,11 @@ int F4GB::new_column(packed_monomial m)
 int F4GB::find_or_append_column(packed_monomial m)
 {
   packed_monomial new_m;
-  if (H.find_or_insert(m, new_m)) return static_cast<int>(new_m[-1]);
+  if (mMonomialHashTable.find_or_insert(m, new_m)) return static_cast<int>(new_m[-1]);
   // At this point, m is a new monomial to be placed as a column
   m = next_monom;
-  B.intern(1 + M->monomial_size(m));
-  next_monom = B.reserve(1 + M->max_monomial_size());
+  mMonomialMemoryBlock.intern(1 + mMonomialInfo->monomial_size(m));
+  next_monom = mMonomialMemoryBlock.reserve(1 + mMonomialInfo->max_monomial_size());
   next_monom++;
   return new_column(m);
 }
@@ -146,34 +159,36 @@ int F4GB::mult_monomials(packed_monomial m, packed_monomial n)
   // If it is there, return its column
   // If not, increment our memory block, and insert a new column.
   packed_monomial new_m;
-  M->unchecked_mult(m, n, next_monom);
-  if (H.find_or_insert(next_monom, new_m))
+  mMonomialInfo->unchecked_mult(m, n, next_monom);
+  if (mMonomialHashTable.find_or_insert(next_monom, new_m))
     return static_cast<int>(
         new_m[-1]);  // monom exists, don't save monomial space
   m = next_monom;
-  B.intern(1 + M->monomial_size(m));
-  next_monom = B.reserve(1 + M->max_monomial_size());
+  mMonomialMemoryBlock.intern(1 + mMonomialInfo->monomial_size(m));
+  next_monom = mMonomialMemoryBlock.reserve(1 + mMonomialInfo->max_monomial_size());
   next_monom++;
   return new_column(m);
 }
 
 void F4GB::load_gen(int which)
 {
-  GBF4Polynomial &g = gens[which]->f;
+  GBF4Polynomial &g = mGenerators[which]->f;
 
   row_elem r{};
   r.monom = nullptr;  // This says that this element corresponds to a generator
   r.elem = which;
   r.len = g.len;
-  r.comps = Mem->components.allocate(g.len);
+  // r.comps = Mem->components.allocate(g.len);
+  r.comps = new int[g.len];
   // r.coeffs is already initialized to [nullptr].
 
+  // TODO: this iterator requires knowledge about memory layout of monomials
   monomial_word *w = g.monoms;
   for (int i = 0; i < g.len; i++)
     {
-      M->copy(w, next_monom);
+      mMonomialInfo->copy(w, next_monom);
       r.comps[i] = find_or_append_column(next_monom);
-      w += M->monomial_size(w);
+      w += mMonomialInfo->monomial_size(w);
     }
 
   mat->rows.push_back(r);
@@ -181,20 +196,22 @@ void F4GB::load_gen(int which)
 
 void F4GB::load_row(packed_monomial monom, int which)
 {
-  GBF4Polynomial &g = gb[which]->f;
+  GBF4Polynomial &g = mGroebnerBasis[which]->f;
 
   row_elem r{};
   r.monom = monom;
   r.elem = which;
   r.len = g.len;
-  r.comps = Mem->components.allocate(g.len);
+  // r.comps = Mem->components.allocate(g.len);
+  r.comps = new int[g.len];
   // r.coeffs is already initialized to [nullptr].
-  
+
+  // TODO: this iterator requires knowledge about memory layout of monomials  
   monomial_word *w = g.monoms;
   for (int i = 0; i < g.len; i++)
     {
       r.comps[i] = mult_monomials(monom, w);
-      w += M->monomial_size(w);
+      w += mMonomialInfo->monomial_size(w);
     }
 
   mat->rows.push_back(r);
@@ -211,13 +228,13 @@ void F4GB::process_column(int c)
   column_elem &ce = mat->columns[c];
   if (ce.head >= -1) return;
   int32_t which;
-  bool found = lookup->find_one_divisor_packed(M, ce.monom, which);
+  bool found = mLookupTable.find_one_divisor_packed(mMonomialInfo, ce.monom, which);
   if (found)
     {
       packed_monomial n = next_monom;
-      M->unchecked_divide(ce.monom, gb[which]->f.monoms, n);
-      B.intern(1 + M->monomial_size(n));
-      next_monom = B.reserve(1 + M->max_monomial_size());
+      mMonomialInfo->unchecked_divide(ce.monom, mGroebnerBasis[which]->f.monoms, n);
+      mMonomialMemoryBlock.intern(1 + mMonomialInfo->monomial_size(n));
+      next_monom = mMonomialMemoryBlock.reserve(1 + mMonomialInfo->max_monomial_size());
       next_monom++;
       // M->set_component(which, n);
       ce.head = INTSIZE(mat->rows);
@@ -227,21 +244,35 @@ void F4GB::process_column(int c)
     ce.head = -1;
 }
 
+void F4GB::loadSPairRow(spair *p)
+{
+   packed_monomial n = next_monom;
+   mMonomialInfo->unchecked_divide(p->lcm, mGroebnerBasis[p->i]->f.monoms, n);
+   mMonomialMemoryBlock.intern(1 + mMonomialInfo->monomial_size(n));
+   next_monom = mMonomialMemoryBlock.reserve(1 + mMonomialInfo->max_monomial_size());
+   next_monom++;
+   load_row(n, p->i);
+}
+
+void F4GB::loadReducerRow(spair *p)
+{
+   packed_monomial n = next_monom;
+   mMonomialInfo->unchecked_divide(p->lcm, mGroebnerBasis[p->j]->f.monoms, n);
+   mMonomialMemoryBlock.intern(1 + mMonomialInfo->monomial_size(n));
+   next_monom = mMonomialMemoryBlock.reserve(1 + mMonomialInfo->max_monomial_size());
+   next_monom++;
+   load_row(n, p->j);
+}
+
 void F4GB::process_s_pair(spair *p)
 {
   int c;
 
   switch (p->type)
     {
-      case F4_SPAIR_SPAIR:
+    case SPairType::SPair:
         {
-          packed_monomial n = next_monom;
-          M->unchecked_divide(p->lcm, gb[p->i]->f.monoms, n);
-          B.intern(1 + M->monomial_size(n));
-          next_monom = B.reserve(1 + M->max_monomial_size());
-          next_monom++;
-
-          load_row(n, p->i);
+          loadSPairRow(p);
           c = mat->rows[mat->rows.size() - 1].comps[0];
 
           if (mat->columns[c].head >= -1)
@@ -249,21 +280,15 @@ void F4GB::process_s_pair(spair *p)
           else
             {
               // In this situation, we load the other half as a reducer
-              n = next_monom;
-              M->unchecked_divide(p->lcm, gb[p->j]->f.monoms, n);
-              B.intern(1 + M->monomial_size(n));
-              next_monom = B.reserve(1 + M->max_monomial_size());
-              next_monom++;
-              load_row(n, p->j);
-
+              loadReducerRow(p);
               mat->columns[c].head = INTSIZE(mat->rows) - 1;
             }
           break;
         }
-      case F4_SPAIR_GEN:
+    case SPairType::Generator:
         load_gen(p->i);
         break;
-      default:
+    case SPairType::Retired:
         break;
     }
 }
@@ -284,10 +309,12 @@ void F4GB::reorder_columns()
 
   // sort the columns
 
-  int *column_order = Mem->components.allocate(ncols);
-  int *ord = Mem->components.allocate(ncols);
+  //int *column_order = Mem->components.allocate(ncols);
+  //int *ord = Mem->components.allocate(ncols);
+  int *column_order = new int[ncols];
+  int *ord = new int[ncols];
 
-  ColumnsSorter C(M, mat);
+  ColumnsSorter C(mMonomialInfo, mat);
 
 // Actual sort of columns /////////////////
 
@@ -346,8 +373,10 @@ void F4GB::reorder_columns()
     }
 
   std::swap(mat->columns, newcols);
-  Mem->components.deallocate(column_order);
-  Mem->components.deallocate(ord);
+  delete [] column_order;
+  delete [] ord;
+  //Mem->components.deallocate(column_order);
+  //Mem->components.deallocate(ord);
 }
 
 void F4GB::reorder_rows()
@@ -355,7 +384,7 @@ void F4GB::reorder_rows()
   int nrows = INTSIZE(mat->rows);
   int ncols = INTSIZE(mat->columns);
   coefficient_matrix::row_array newrows;
-  VECTOR(long) rowlocs;  // 0..nrows-1 of where row as been placed
+  std::vector<long> rowlocs;  // 0..nrows-1 of where row as been placed
 
   newrows.reserve(nrows);
   rowlocs.reserve(nrows);
@@ -383,7 +412,7 @@ void F4GB::reset_matrix()
 {
   // mat
   next_col_to_process = 0;
-  next_monom = B.reserve(1 + M->max_monomial_size());
+  next_monom = mMonomialMemoryBlock.reserve(1 + mMonomialInfo->max_monomial_size());
   next_monom++;
 }
 
@@ -392,23 +421,22 @@ void F4GB::clear_matrix()
   // Clear the rows first
   for (auto & r : mat->rows)
     {
+      //      std::cout << "at A" << std::endl;
       if (not r.coeffs.isNull())
         mVectorArithmetic->deallocateElementArray(r.coeffs);
-      Mem->components.deallocate(r.comps);
+      // Mem->components.deallocate(r.comps);
+      //      std::cout << "at B" << std::endl;
+      if (r.comps != nullptr)
+        delete [] r.comps;
+      //      std::cout << "at C" << std::endl;
       r.len = 0;
       r.elem = -1;
       r.monom = nullptr;
     }
   mat->rows.clear();
   mat->columns.clear();
-  H.reset();
-  B.reset();
-  if (M2_gbTrace >= 4)
-    {
-      Mem->components.show();
-      Mem->coefficients.show();
-      Mem->show();
-    }
+  mMonomialHashTable.reset();
+  mMonomialMemoryBlock.reset();
 }
 
 void F4GB::make_matrix()
@@ -419,10 +447,13 @@ void F4GB::make_matrix()
      Is this the best order to do it in?  Maybe not...
   */
 
-  spair *p;
-  while ((p = S->get_next_pair()))
+  //spair *p;
+  std::pair<bool,spair> p;
+  p = mSPairSet.get_next_pair();
+  while (p.first)
     {
-      process_s_pair(p);
+      process_s_pair(&p.second);
+      p = mSPairSet.get_next_pair();
     }
 
   while (next_col_to_process < mat->columns.size())
@@ -444,8 +475,83 @@ void F4GB::make_matrix()
   reorder_columns();
 
   // reorder_rows();  // This is only here so we can see what we are doing...?
+
+  // For debugging: let's compute the number of non-zero elements above the diagonal in A matrix.
+  // Also, let's compute the amount of space used for:
+  //  column info
+  //  row info
+  //  sum of values in each row
+  // Maybe so A B C D matrices separately.
+  macaulayMatrixStats();
+}
+
+auto F4GB::macaulayMatrixStats() const -> MacaulayMatrixStats
+{
+  // Want:
+  //  sizes of A, B, C, D.
+  //  number of elements in each part.
+  // loop through the rows, and for each row, determine: in A or C?
+  //   loop through elements in a row: for each, determine: in A/C or B/D
+
+  MacaulayMatrixStats stats;
+
+  // look at 'mat', determine number of rows, cols, etc.
+  long nrows = mat->rows.size();
+  long ncols = mat->columns.size();
+  for (long i=0; i<nrows; ++i)
+    {
+      bool is_pivot = is_pivot_row(i);
+      if (is_pivot)
+        ++stats.mTopAndLeft;
+
+      auto& r = mat->rows[i];
+      for (long j=0; j < r.len; ++j)
+          {
+            auto c = r.comps[j];
+            bool is_left = mat->columns[c].head >= 0;
+            if (is_pivot and is_left) ++stats.mAEntries;
+            if (is_pivot and not is_left) ++stats.mBEntries;
+            if (not is_pivot and is_left) ++stats.mCEntries;
+            if (not is_pivot and not is_left) ++stats.mDEntries;
+          }
+    }
+
+  stats.mAEntries -= stats.mTopAndLeft;
+  stats.mBottom = nrows - stats.mTopAndLeft;
+  stats.mRight = ncols - stats.mTopAndLeft;
+
+  stats.display();
+  return stats;
+}
+
+void F4GB::MacaulayMatrixStats::display(buffer& o)
+{
+  std::ostringstream s;
+  
+  s << "Quad matrix sizes" << std::endl;
+  s << "sizes of quad matrix" << std::endl;
+  s << std::setw(11) << " " << std::setw(11) << mTopAndLeft << std::setw(11) << mRight << std::endl;
+  s << std::setw(11) << mTopAndLeft << std::setw(11) << "A" << std::setw(11) << "B" << std::endl;
+  s << std::setw(11) << mBottom << std::setw(11) << "C" << std::setw(11) << "D" << std::endl;
+  s << std::endl;
+
+  s << "Quad matrix entries (no diagona on A)" << std::endl;
+  s << std::setw(11) << " " << std::setw(11) << mTopAndLeft << std::setw(11) << mRight << std::endl;
+  s << std::setw(11) << mTopAndLeft << std::setw(11) << mAEntries << std::setw(11) << mBEntries << std::endl;
+  s << std::setw(11) << mBottom << std::setw(11) << mCEntries << std::setw(11) <<  mDEntries << std::endl;
+  s << std::endl;
+  
+  o << s.str().c_str();
 }
 
+void F4GB::MacaulayMatrixStats::display()
+{
+  buffer o;
+  display(o);
+  emit(o.str());
+}
+
+
 ///////////////////////////////////////////////////
 // Gaussian elimination ///////////////////////////
 ///////////////////////////////////////////////////
@@ -457,8 +563,8 @@ const ElementArray& F4GB::get_coeffs_array(row_elem &r)
 
   // At this point, we must go find the coeff array
   if (r.monom == nullptr)  // i.e. a generator
-    return gens[r.elem]->f.coeffs;
-  return gb[r.elem]->f.coeffs;
+    return mGenerators[r.elem]->f.coeffs;
+  return mGroebnerBasis[r.elem]->f.coeffs;
 }
 
 bool F4GB::is_new_GB_row(int row) const
@@ -482,27 +588,138 @@ void F4GB::gauss_reduce(bool diagonalize)
   int ncols = INTSIZE(mat->columns);
 
   int n_newpivots = -1;  // the number of new GB elements in this degree
-  int n_zero_reductions = 0;
+  std::atomic<long> n_zero_reductions = 0;
+
+  mtbb::tick_count t0;
+  mtbb::tick_count t1;
+
   if (hilbert)
     {
       n_newpivots = hilbert->nRemainingExpected();
-      if (n_newpivots == 0) return;
+      if (n_newpivots == 0)
+        {
+          std::cout << "Oh crap, our logic is wrong: should not get here if no new GB elements expected in this degree" << std::endl;
+          return;
+        }
     }
 
+#if defined(WITH_TBB)
+//#if 0
+
+  std::vector<int> spair_rows;
+  for (int i = 0; i < nrows; ++i)
+    {
+      if (not is_pivot_row(i))
+        spair_rows.push_back(i);
+    }
+  
+  std::mutex cout_guard;
+  t0 = mtbb::tick_count::now();
+  using threadLocalDense_t = mtbb::enumerable_thread_specific<ElementArray>;
+  // create a dense array for each thread
+  threadLocalDense_t threadLocalDense([&]() { 
+    return mVectorArithmetic->allocateElementArray(ncols);
+  });
+  std::cout << "mNumThreads: " << mNumThreads << std::endl;
+  mScheduler.execute([&] {
+    mtbb::parallel_for(mtbb::blocked_range<int>{0, INTSIZE(spair_rows)},
+                       [&](const mtbb::blocked_range<int>& r)
+                       {
+                         // long actual_reductions = 0;
+                         //  for (auto i = r.begin(); i != r.end(); ++i)
+                         //    {
+                         //      if (not is_pivot_row(i))
+                         //        ++ actual_reductions;
+                         //    }
+
+                         //  cout_guard.lock();
+                         //  std::cout << "#reductions to do: " << actual_reductions << std::endl;
+                         //  cout_guard.unlock();
+
+                         threadLocalDense_t::reference my_dense = threadLocalDense.local();
+                         for (auto i = r.begin(); i != r.end(); ++i)
+                         {
+                            // these lines are commented out to avoid the hilbert hint for now...
+                            //if ((not hilbert) or (n_newpivots > 0))
+                            //{
+                               bool newNonzeroReduction = gauss_reduce_row(spair_rows[i], my_dense);
+                               if (not newNonzeroReduction) ++n_zero_reductions;
+                            //   if (hilbert && newNonzeroReduction)
+                            //     --n_newpivots;
+                            //}
+                         }
+                       });
+  });
+  for (auto tlDense : threadLocalDense)
+    mVectorArithmetic->deallocateElementArray(tlDense);
+  t1 = mtbb::tick_count::now();
+  mParallelGaussTime += (t1-t0).seconds();
+#endif 
+
+  std::cout << "About to do serial loop, n_newpivots = " << n_newpivots << std::endl;
+  t0 = mtbb::tick_count::now();
   ElementArray gauss_row { mVectorArithmetic->allocateElementArray(ncols) };
-  for (int i = 0; i < nrows; i++)
+  for (auto i = 0; i < spair_rows.size(); i++)
     {
-      row_elem &r = mat->rows[i];
-      if (r.len == 0) continue;  // could happen once we include syzygies...
+      auto this_row = spair_rows[i];
+      if ((not hilbert) or (n_newpivots > 0))
+      {
+         bool newNonzeroReduction = gauss_reduce_row(this_row, gauss_row);
+         if (newNonzeroReduction)
+         {
+            row_elem &r = mat->rows[this_row];
+            mVectorArithmetic->makeMonic(r.coeffs);
+            mat->columns[r.comps[0]].head = this_row;
+            if (hilbert) --n_newpivots;
+         }
+         else
+            ++n_zero_reductions;
+      } else if (hilbert)
+        {
+          // Inform code that we don't want a new GB element from this row.
+          // The row will be deleted with clear_matrix.
+          row_elem &r = mat->rows[this_row];
+          r.len = 0;
+        }
+    }
+  mVectorArithmetic->deallocateElementArray(gauss_row);
+  t1 = mtbb::tick_count::now();
+  mSerialGaussTime += (t1-t0).seconds();
+
+  if (M2_gbTrace >= 3)
+    fprintf(stderr, "-- #zeroreductions %ld\n", n_zero_reductions.load());
+
+  t0 = mtbb::tick_count::now();
+  if (diagonalize) tail_reduce();
+  t1 = mtbb::tick_count::now();
+  mTailReduceTime += (t1-t0).seconds();
+}
+
+bool F4GB::is_pivot_row(int index) const
+{
+  row_elem &r = mat->rows[index];
+  int pivotcol = r.comps[0];
+  int pivotrow = mat->columns[pivotcol].head;
+  return (pivotrow == index);
+}
+
+bool F4GB::gauss_reduce_row(int index,
+                            ElementArray& gauss_row)
+{
+   // returns true if the row reduces to a "new" nonzero row
+   // returns false otherwise
+      row_elem &r = mat->rows[index];
+      if (r.len == 0) return false;  // could happen once we include syzygies...
+      if (is_pivot_row(index)) return false;
+
       int pivotcol = r.comps[0];
       int pivotrow = mat->columns[pivotcol].head;
-      if (pivotrow == i) continue;  // this is a pivot row, so leave it alone
 
       const ElementArray& rcoeffs = get_coeffs_array(r);
       n_pairs_computed++;
       mVectorArithmetic->fillDenseArray(gauss_row, rcoeffs, Range(r.comps, r.comps + r.len));
 
-      int firstnonzero = ncols;
+      int firstnonzero = -1;
       int first = r.comps[0];
       int last = r.comps[r.len - 1];
       do
@@ -521,18 +738,23 @@ void F4GB::gauss_reduce(bool diagonalize)
               int last1 = pivot_rowelem.comps[pivot_rowelem.len - 1];
               if (last1 > last) last = last1;
             }
-          else if (firstnonzero == ncols)
+          else if (firstnonzero == -1)
             firstnonzero = first;
           first = mVectorArithmetic->denseNextNonzero(gauss_row, first+1, last);
         }
-      while (first <= last);
+      while (first <= last);  // end do
+
       if (not r.coeffs.isNull())
         mVectorArithmetic->deallocateElementArray(r.coeffs);
-      Mem->components.deallocate(r.comps);
+      //Mem->components.deallocate(r.comps);
+      delete [] r.comps;
+      r.comps = nullptr;
       r.len = 0;
       //Range<int> monomRange;
       //mVectorArithmetic->denseToSparse(gauss_row, r.coeffs, monomRange, firstnonzero, last, mComponentSpace);
-      mVectorArithmetic->denseToSparse(gauss_row, r.coeffs, r.comps, firstnonzero, last, Mem->components);
+      //mVectorArithmetic->denseToSparse(gauss_row, r.coeffs, r.comps, firstnonzero, last, Mem->components);
+      mVectorArithmetic->denseToSparse(gauss_row, r.coeffs, r.comps, firstnonzero, last);
+
       // TODO set r.comps from monomRange.  Question: who has allocated this space??
       // Maybe for now it is just the following line:
       r.len = mVectorArithmetic->size(r.coeffs);
@@ -545,22 +767,9 @@ void F4GB::gauss_reduce(bool diagonalize)
       // the above line leaves gauss_row zero, and also handles the case when
       // r.len is 0 (TODO: check this!!)
       // it also potentially frees the old r.coeffs and r.comps (TODO: ?? r.comps??)
-      if (r.len > 0)
-        {
-          mVectorArithmetic->makeMonic(r.coeffs);
-          mat->columns[r.comps[0]].head = i;
-          if (--n_newpivots == 0) break;
-        }
-      else
-        n_zero_reductions++;
-    }
-  mVectorArithmetic->deallocateElementArray(gauss_row);
-
-  if (M2_gbTrace >= 3)
-    fprintf(stderr, "-- #zeroreductions %d\n", n_zero_reductions);
-
-  if (diagonalize) tail_reduce();
+      return (r.len > 0);
 }
+                            
 
 void F4GB::tail_reduce()
 {
@@ -601,7 +810,8 @@ void F4GB::tail_reduce()
         };
       if (anychange)
         {
-          Mem->components.deallocate(r.comps);
+          //Mem->components.deallocate(r.comps);
+          delete [] r.comps;
           mVectorArithmetic->deallocateElementArray(r.coeffs);
           r.len = 0;
           //Range<int> monomRange;
@@ -610,11 +820,15 @@ void F4GB::tail_reduce()
           //                                       monomRange,
           //                                       firstnonzero, last,
           //                                       mComponentSpace);
+          // mVectorArithmetic->denseToSparse(gauss_row,
+          //                                        r.coeffs,
+          //                                        r.comps,
+          //                                        firstnonzero, last,
+          //                                        Mem->components);
           mVectorArithmetic->denseToSparse(gauss_row,
                                                  r.coeffs,
                                                  r.comps,
-                                                 firstnonzero, last,
-                                                 Mem->components);
+                                                 firstnonzero, last);
           r.len = mVectorArithmetic->size(r.coeffs);
           //r.len = monomRange.size();
           //r.comps = Mem->components.allocate(r.len);
@@ -648,7 +862,7 @@ void F4GB::insert_gb_element(row_elem &r)
   //  insert the monomial into the lookup table
   //  find new pairs associated to this new element
 
-  int nslots = M->max_monomial_size();
+  int nslots = mMonomialInfo->max_monomial_size();
   int nlongs = r.len * nslots;
 
   gbelem *result = new gbelem;
@@ -670,39 +884,50 @@ void F4GB::insert_gb_element(row_elem &r)
     {
       result->f.coeffs.swap(r.coeffs);
     }
-  result->f.monoms = Mem->allocate_monomial_array(nlongs);
+  
+  //result->f.monoms = Mem->allocate_monomial_array(nlongs);
+  result->f.monoms = new monomial_word[nlongs];
 
   monomial_word *nextmonom = result->f.monoms;
   for (int i = 0; i < r.len; i++)
     {
-      M->copy(mat->columns[r.comps[i]].monom, nextmonom);
+      mMonomialInfo->copy(mat->columns[r.comps[i]].monom, nextmonom);
       nextmonom += nslots;
     }
-  Mem->components.deallocate(r.comps);
+  // Mem->components.deallocate(r.comps);
+  delete [] r.comps;
+  r.comps = nullptr;
   r.len = 0;
   result->deg = this_degree;
-  result->alpha = static_cast<int>(M->last_exponent(result->f.monoms));
+  result->alpha = static_cast<int>(mMonomialInfo->last_exponent(result->f.monoms));
   result->minlevel = ELEM_MIN_GB;  // MES: How do
   // we distinguish between ELEM_MIN_GB, ELEM_POSSIBLE_MINGEN?
 
-  int which = INTSIZE(gb);
-  gb.push_back(result);
+  int which = INTSIZE(mGroebnerBasis);
+  mGroebnerBasis.push_back(result);
 
   if (hilbert)
     {
       int x;
-      int *exp = newarray_atomic(int, M->n_vars());
-      M->to_intstar_vector(result->f.monoms, exp, x);
+      //int *exp = newarray_atomic(int, M->n_vars());
+      int *exp = new int[mMonomialInfo->n_vars()];
+      mMonomialInfo->to_intstar_vector(result->f.monoms, exp, x);
       hilbert->addMonomial(exp, x + 1);
-      freemem(exp);
+      delete [] exp;
+      //freemem(exp);
     }
   // now insert the lead monomial into the lookup table
-  varpower_monomial vp = newarray_atomic(varpower_word, 2 * M->n_vars() + 1);
-  M->to_varpower_monomial(result->f.monoms, vp);
-  lookup->insert_minimal_vp(M->get_component(result->f.monoms), vp, which);
-  freemem(vp);
+  //varpower_monomial vp = newarray_atomic(varpower_word, 2 * M->n_vars() + 1);
+  varpower_monomial vp = new varpower_word[2 * mMonomialInfo->n_vars() + 1];
+  mMonomialInfo->to_varpower_monomial(result->f.monoms, vp);
+  mLookupTable.insert_minimal_vp(mMonomialInfo->get_component(result->f.monoms), vp, which);
+  delete [] vp;
+  //freemem(vp);
   // now go forth and find those new pairs
-  S->find_new_pairs(is_ideal);
+  mtbb::tick_count t0 = mtbb::tick_count::now();
+  mSPairSet.find_new_pairs(is_ideal);
+  mtbb::tick_count t1 = mtbb::tick_count::now();
+  mNewSPairTime += (t1-t0).seconds();
 }
 
 void F4GB::new_GB_elements()
@@ -721,6 +946,7 @@ void F4GB::new_GB_elements()
      then
      we don't need to loop through all of these */
 
+  mtbb::tick_count t0 = mtbb::tick_count::now();
   for (int r = 0; r < mat->rows.size(); r++)
     {
       if (is_new_GB_row(r))
@@ -728,6 +954,8 @@ void F4GB::new_GB_elements()
           insert_gb_element(mat->rows[r]);
         }
     }
+  mtbb::tick_count t1 = mtbb::tick_count::now();
+  mInsertGBTime += (t1-t0).seconds();
 }
 
 ///////////////////////////////////////////////////
@@ -738,6 +966,7 @@ void F4GB::do_spairs()
 {
   if (hilbert && hilbert->nRemainingExpected() == 0)
     {
+      mSPairSet.discardSPairsInCurrentDegree();
       if (M2_gbTrace >= 1)
         fprintf(stderr,
                 "-- skipping degree...no elements expected in this degree\n");
@@ -749,7 +978,7 @@ void F4GB::do_spairs()
   n_lcmdups = 0;
   make_matrix();
 
-  if (M2_gbTrace >= 5)
+  if (M2_gbTrace >= 6)
     {
       fprintf(stderr, "---------\n");
       show_matrix();
@@ -762,7 +991,13 @@ void F4GB::do_spairs()
   nsecs /= CLOCKS_PER_SEC;
   if (M2_gbTrace >= 2) fprintf(stderr, " make matrix time = %f\n", nsecs);
 
-  if (M2_gbTrace >= 2) H.dump();
+  if (M2_gbTrace >= 3) mMonomialHashTable.dump();
+
+  double oldParallelGauss = mParallelGaussTime;
+  double oldSerialGauss = mSerialGaussTime;
+  double oldTailReduce = mTailReduceTime;
+  double oldNewSPair = mNewSPairTime;
+  double oldInsertNewGB = mInsertGBTime;
 
   begin_time = clock();
   gauss_reduce(true);
@@ -778,9 +1013,12 @@ void F4GB::do_spairs()
   if (M2_gbTrace >= 2)
     {
       fprintf(stderr, " gauss time          = %f\n", nsecs);
+      fprintf(stderr, " parallel gauss time          = %g\n", mParallelGaussTime - oldParallelGauss);
+      fprintf(stderr, " serial gauss time            = %g\n", mSerialGaussTime - oldSerialGauss);
+      fprintf(stderr, " tail reduce time             = %g\n", mTailReduceTime - oldTailReduce);
 
       fprintf(stderr, " lcm dups            = %ld\n", n_lcmdups);
-      if (M2_gbTrace >= 5)
+      if (M2_gbTrace >= 6)
         {
           fprintf(stderr, "---------\n");
           show_matrix();
@@ -788,11 +1026,18 @@ void F4GB::do_spairs()
         }
     }
   new_GB_elements();
-  int ngb = INTSIZE(gb);
+  fprintf(stderr, " finding new spair time             = %g\n",  mNewSPairTime - oldNewSPair);
+  fprintf(stderr, " number of spairs in queue          = %zu\n", mSPairSet.numberOfSPairs());
+  fprintf(stderr, " insert new gb time                 = %g\n",  mInsertGBTime - oldInsertNewGB - (mNewSPairTime - oldNewSPair));
+  
+  int ngb = INTSIZE(mGroebnerBasis);
   if (M2_gbTrace >= 1)
     {
       fprintf(stderr, " # GB elements   = %d\n", ngb);
-      if (M2_gbTrace >= 5) show_gb_array(gb);
+      if (M2_gbTrace >= 5) {
+        show_gb_array(mGroebnerBasis);
+        mSPairSet.display();
+      }
     }
 
   clear_matrix();
@@ -801,7 +1046,7 @@ void F4GB::do_spairs()
 enum ComputationStatusCode F4GB::computation_is_complete(StopConditions &stop_)
 {
   // This handles everything but stop_.always, stop_.degree_limit
-  if (stop_.basis_element_limit > 0 && gb.size() >= stop_.basis_element_limit)
+  if (stop_.basis_element_limit > 0 && mGroebnerBasis.size() >= stop_.basis_element_limit)
     return COMP_DONE_GB_LIMIT;
   if (stop_.pair_limit > 0 && n_pairs_computed >= stop_.pair_limit)
     return COMP_DONE_PAIR_LIMIT;
@@ -826,13 +1071,13 @@ void F4GB::test_spair_code()
   // gb matrix, and then calling find_new_pairs after each one.
   // It displays the list of spairs after each generator.
 
-  gb.push_back(gens[0]);
-  for (int i = 1; i < gens.size(); i++)
+  mGroebnerBasis.push_back(mGenerators[0]);
+  for (int i = 1; i < mGenerators.size(); i++)
     {
-      gb.push_back(gens[i]);
-      S->find_new_pairs(false);
+      mGroebnerBasis.push_back(mGenerators[i]);
+      mSPairSet.find_new_pairs(false);
       fprintf(stderr, "---Just inserted element %d---\n", i);
-      S->display();
+      mSPairSet.display();
     }
 }
 
@@ -841,7 +1086,7 @@ enum ComputationStatusCode F4GB::start_computation(StopConditions &stop_)
   clock_sort_columns = 0;
   clock_gauss = 0;
   clock_make_matrix = 0;
-  int npairs;
+  int npairs = 0;
 
   //  test_spair_code();
 
@@ -858,13 +1103,17 @@ enum ComputationStatusCode F4GB::start_computation(StopConditions &stop_)
       is_done = computation_is_complete(stop_);
       if (is_done != COMP_COMPUTING) break;
 
-      this_degree = S->prepare_next_degree(-1, npairs);
-
-      if (npairs == 0)
+      //this_degree = mSPairSet.prepare_next_degree(-1, npairs);
+      auto thisDegInfo = mSPairSet.setThisDegree();
+      
+      if (not thisDegInfo.first)
         {
           is_done = COMP_DONE;
           break;
         }
+
+      this_degree = thisDegInfo.second;
+      
       if (stop_.stop_after_degree && this_degree > stop_.degree_limit->array[0])
         {
           is_done = COMP_DONE_DEGREE_LIMIT;
@@ -914,18 +1163,39 @@ enum ComputationStatusCode F4GB::start_computation(StopConditions &stop_)
       fprintf(stderr,
               "total time for gauss: %f\n",
               ((double)clock_gauss) / CLOCKS_PER_SEC);
+      fprintf(stderr,
+              "parallel tbb time for gauss: %g\n",
+              mParallelGaussTime);
+      fprintf(stderr,
+              "serial tbb time for gauss: %g\n",
+              mSerialGaussTime);
+      fprintf(stderr,
+              "total time for finding new spairs: %g\n",
+              mNewSPairTime);
+      fprintf(stderr,
+              "total time for creating pre spairs: %g\n",
+              mSPairSet.secondsToCreatePrePairs());
+      fprintf(stderr,
+              "total time for minimizing new spairs: %g\n",
+              mSPairSet.secondsToMinimizePairs());
+      fprintf(stderr,
+              "total time for inserting new gb elements: %g\n",
+              mInsertGBTime);
       fprintf(stderr,
               "number of spairs computed           : %ld\n",
               n_pairs_computed);
       fprintf(stderr,
               "number of reduction steps           : %ld\n",
               n_reduction_steps);
+      if (M2_gbTrace >= 3)
+      {
+        fprintf(stderr,
+              "number of spairs removed by criterion = %ld\n",
+              mSPairSet.n_unneeded_pairs());
+        mMonomialInfo->show();
+      }
     }
 
-  fprintf(stderr,
-          "number of spairs removed by criterion = %ld\n",
-          S->n_unneeded_pairs());
-  M->show();
   return is_done;
 }
 
@@ -942,10 +1212,10 @@ void F4GB::show_gb_array(const gb_array &g) const
   for (int i = 0; i < g.size(); i++)
     {
       vec v = F4toM2Interface::to_M2_vec(
-          mVectorArithmetic, M, g[i]->f, F);
+          mVectorArithmetic, mMonomialInfo, g[i]->f, mFreeModule);
       o << "element " << i << " degree " << g[i]->deg << " alpha "
         << g[i]->alpha << newline << "    ";
-      F->get_ring()->vec_text_out(o, v);
+      mFreeModule->get_ring()->vec_text_out(o, v);
       o << newline;
     }
   emit(o.str());
@@ -960,7 +1230,7 @@ void F4GB::show_row_info() const
       if (mat->rows[i].monom == nullptr)
         fprintf(stderr, "generator");
       else
-        M->show(mat->rows[i].monom);
+        mMonomialInfo->show(mat->rows[i].monom);
       fprintf(stderr, "\n");
     }
 }
@@ -971,7 +1241,7 @@ void F4GB::show_column_info() const
   for (int i = 0; i < mat->columns.size(); i++)
     {
       fprintf(stderr, "head %4d monomial ", mat->columns[i].head);
-      M->show(mat->columns[i].monom);
+      mMonomialInfo->show(mat->columns[i].monom);
       fprintf(stderr, "\n");
     }
 }
@@ -980,7 +1250,7 @@ void F4GB::show_matrix()
 {
   // Debugging routine
   MutableMatrix *q = F4toM2Interface::to_M2_MutableMatrix(
-      mVectorArithmetic, mat, gens, gb);
+      mVectorArithmetic, mat, mGenerators, mGroebnerBasis);
   buffer o;
   q->text_out(o);
   emit(o.str());
diff --git a/M2/Macaulay2/e/f4/f4.hpp b/M2/Macaulay2/e/f4/f4.hpp
index 761bc28740e..e73cce057da 100644
--- a/M2/Macaulay2/e/f4/f4.hpp
+++ b/M2/Macaulay2/e/f4/f4.hpp
@@ -71,15 +71,15 @@
 #include "engine-exports.h"         // for M2_arrayint, M2_bool
 #include "f4-types.hpp"             // for gb_array, MonomialLookupTable
 #include "f4/moninfo.hpp"           // for packed_monomial, MonomialInfo
+#include "f4/f4-spairs.hpp"         // For F4SPairSet
 #include "interface/computation.h"  // for ComputationStatusCode, StopCondit...
+#include "m2tbb.hpp"                // for TBB
 #include "memblock.hpp"             // for F4MemoryBlock
 #include "monhashtable.hpp"         // for MonomialHashTable
 #include "newdelete.hpp"            // for our_new_delete
 #include "MemoryBlock.hpp"          // for MemoryBlock<T>
 #include <ctime>                    // for clock, CLOCKS_PER_SEC, clock_t
 
-class F4Mem;
-class F4SPairSet;
 class FreeModule;
 class HilbertController;
 class RingElement;
@@ -91,8 +91,8 @@ class F4GB : public our_new_delete
 {
   // Basic required information
   const VectorArithmetic* mVectorArithmetic;
-  const MonomialInfo *M;
-  const FreeModule *F;
+  const MonomialInfo *mMonomialInfo;
+  const FreeModule *mFreeModule;
   M2_arrayint weights;  // The length of this is the number of variables, each
                         // entry is positive.
   M2_arrayint component_degrees;  // Degree of each free module element.
@@ -114,27 +114,65 @@ class F4GB : public our_new_delete
   // Monomial order information.  Should this be in M?
 
   // The main players in the computation
-  gb_array gens;
-  gb_array gb;
-  MonomialLookupTable *lookup;  // (monom,comp) --> index into gb
-  F4SPairSet *S;
+  gb_array mGenerators;
+  gb_array mGroebnerBasis;
+  MonomialLookupTable mLookupTable;  // (monom,comp) --> index into mGroebnerBasis
 
   // The matrix and its construction
   int next_col_to_process;
   coefficient_matrix *mat;
-  MonomialHashTable<MonomialInfo> H;
-  F4MemoryBlock<monomial_word> B;
+  MonomialHashTable<MonomialInfo> mMonomialHashTable;
+  F4MemoryBlock<monomial_word> mMonomialMemoryBlock;
   monomial_word *next_monom;  // valid while creating the matrix
   
-  F4Mem *Mem;  // Used to allocate and deallocate arrays used in the matrix
-  MemoryBlock mComponentSpace; // stop-gap for use with VectorArithmetic and Mem.
+  MemoryBlock mComponentSpace; // stop-gap for use with VectorArithmetic and Mem. (TODO: what does this comment mean?)
 
   // cumulative timing info
   double clock_sort_columns;
   clock_t clock_gauss;
+  double mGaussTime;
+  double mParallelGaussTime;
+  double mSerialGaussTime;
+  double mTailReduceTime;
+  double mNewSPairTime;
+  double mInsertGBTime;
   clock_t clock_make_matrix;
 
- private:
+  struct MacaulayMatrixStats
+  {
+  public:
+    // #rows/cols
+    long mTopAndLeft = 0;
+    long mBottom = 0;
+    long mRight = 0;
+    
+    // #entries
+    long mAEntries = 0; // but not the diagonals?
+    long mBEntries = 0;
+    long mCEntries = 0;
+    long mDEntries = 0;
+    
+    // memory usage: column info, row info, all the entries, hash table?
+    long mColumnInfo = 0;
+    long mRowInfo = 0;
+    long mEntries = 0;
+ 
+    void display(buffer& o);
+    void display();
+  };
+
+  MacaulayMatrixStats macaulayMatrixStats() const;  // For debugging info
+  
+#if defined (WITH_TBB)
+  int mNumThreads;
+  mtbb::task_arena mScheduler;
+
+  mtbb::task_arena& getScheduler() { return mScheduler; }
+#endif
+
+  F4SPairSet mSPairSet;
+
+private:
   ////////////////////////////////////////////////////////////////////
   void delete_gb_array(gb_array &g);
 
@@ -154,7 +192,11 @@ class F4GB : public our_new_delete
   void load_gen(int which);
   void load_row(packed_monomial monom, int which);
   void process_column(int c);
+
+  void loadSPairRow(spair *p);
+  void loadReducerRow(spair *p);
   void process_s_pair(spair *p);
+
   void reorder_columns();
   void reorder_rows();
 
@@ -168,7 +210,10 @@ class F4GB : public our_new_delete
   // place
   // and also to determine if an element (row) needs to be tail reduced
 
+  bool is_pivot_row(int index) const;
+  
   void gauss_reduce(bool diagonalize);
+  bool gauss_reduce_row(int index, ElementArray& gauss_row);
   void tail_reduce();
 
   void row_to_dense_row(int r, int &first, int &last);
@@ -182,7 +227,6 @@ class F4GB : public our_new_delete
 
  public:
   F4GB(const VectorArithmetic* VA,
-       F4Mem *Mem0,
        const MonomialInfo *MI,
        const FreeModule *F,  // used for debugging only...
        M2_bool collect_syz,
@@ -190,7 +234,8 @@ class F4GB : public our_new_delete
        M2_arrayint gb_weights,
        int strategy,
        M2_bool use_max_degree,
-       int max_degree);
+       int max_degree,
+       int numThreads);
 
   ~F4GB();
 
@@ -199,10 +244,10 @@ class F4GB : public our_new_delete
 
   void new_generators(int lo, int hi);
 
-  const gb_array &get_generators() const { return gens; }
-  gb_array &get_generators() { return gens; }
-  const gb_array &get_gb() const { return gb; }
-  gb_array &get_gb() { return gb; }
+  const gb_array &get_generators() const { return mGenerators; }
+  gb_array &get_generators() { return mGenerators; }
+  const gb_array &get_gb() const { return mGroebnerBasis; }
+  gb_array &get_gb() { return mGroebnerBasis; }
   void set_hilbert_function(const RingElement *hf);
 
   enum ComputationStatusCode start_computation(StopConditions &stop_);
diff --git a/M2/Macaulay2/e/f4/hilb-fcn.hpp b/M2/Macaulay2/e/f4/hilb-fcn.hpp
index 183c29522c5..b045ea1bee5 100644
--- a/M2/Macaulay2/e/f4/hilb-fcn.hpp
+++ b/M2/Macaulay2/e/f4/hilb-fcn.hpp
@@ -12,7 +12,7 @@ class MatrixConstructor;
 class PolynomialRing;
 class RingElement;
 
-class HilbertController
+class HilbertController : public our_new_delete
 {
  public:
   HilbertController(const FreeModule *F0, const RingElement *hf);
diff --git a/M2/Macaulay2/e/f4/memblock.hpp b/M2/Macaulay2/e/f4/memblock.hpp
index f834837855a..2c9a7dfefac 100644
--- a/M2/Macaulay2/e/f4/memblock.hpp
+++ b/M2/Macaulay2/e/f4/memblock.hpp
@@ -71,7 +71,7 @@ template <typename T, long int NSLAB>
 typename F4MemoryBlock<T, NSLAB>::slab *F4MemoryBlock<T, NSLAB>::new_slab()
 {
   slab *result = new slab;
-  result->next = 0;
+  result->next = nullptr;
   return result;
 }
 
@@ -87,9 +87,9 @@ T *F4MemoryBlock<T, NSLAB>::reserve(int len)
 {
   if (next_free + len > current_slab->block + NSLAB)
     {
-      if (current_slab->next == 0)
+      if (current_slab->next == nullptr)
         {
-          last_slab->next = new slab;
+          last_slab->next = new_slab();
           last_slab = last_slab->next;
           current_slab = last_slab;
         }
diff --git a/M2/Macaulay2/e/f4/moninfo.cpp b/M2/Macaulay2/e/f4/moninfo.cpp
index e3ab48ebaca..b79baefd0ce 100644
--- a/M2/Macaulay2/e/f4/moninfo.cpp
+++ b/M2/Macaulay2/e/f4/moninfo.cpp
@@ -10,7 +10,7 @@
 MonomialInfo::MonomialInfo(int nvars0, const MonomialOrdering *mo)
 {
   nvars = nvars0;
-  hashfcn = newarray_atomic(monomial_word, nvars);
+  hashfcn = new monomial_word[nvars];
   for (int i = 0; i < nvars; i++) hashfcn[i] = rand();
   mask = 0x10000000;
 
@@ -55,7 +55,12 @@ MonomialInfo::MonomialInfo(int nvars0, const MonomialOrdering *mo)
   firstvar = 2 + nweights;
 }
 
-MonomialInfo::~MonomialInfo() { freemem(hashfcn); }
+MonomialInfo::~MonomialInfo()
+{ 
+  delete [] hashfcn; 
+  //freemem(hashfcn);
+}
+
 monomial_word MonomialInfo::monomial_weight(const_packed_monomial m,
                                             const M2_arrayint wts) const
 {
diff --git a/M2/Macaulay2/e/gb-f4/GBF4Interface.cpp b/M2/Macaulay2/e/gb-f4/GBF4Interface.cpp
index 46a1ccbc145..2cf0ee6fbeb 100644
--- a/M2/Macaulay2/e/gb-f4/GBF4Interface.cpp
+++ b/M2/Macaulay2/e/gb-f4/GBF4Interface.cpp
@@ -9,17 +9,19 @@
 // TODO: Fix int/Strategy discrepancy.  Make a ComputationStrategy type in util.hpp or comp-gb.hpp?
 auto createGBF4Interface(const Matrix *inputMatrix,
                          const std::vector<int>& variableWeights, // what is this, do we need it?
-                         int strategy // do we need this?
+                         int strategy, // do we need this?
+                         int numThreads
                          ) -> GBComputation*
 {
-  return newf4::createGBF4Interface(inputMatrix, variableWeights, newf4::Strategy::Normal);  
+  return newf4::createGBF4Interface(inputMatrix, variableWeights, newf4::Strategy::Normal, numThreads);  
 }
 
 namespace newf4 {
 
 auto createGBF4Interface(const Matrix *inputMatrix,
                          const std::vector<int>& variableWeights, // what is this, do we need it?
-                         Strategy strategy // do we need this?
+                         Strategy strategy, // do we need this?
+                         int numThreads
                          ) -> GBComputation*
 {
   const PolynomialRing* R = inputMatrix->get_ring()->cast_to_PolynomialRing();
@@ -29,7 +31,8 @@ auto createGBF4Interface(const Matrix *inputMatrix,
   auto C = new GBF4Interface(R,
                              inputMatrix,
                              variableWeights,
-                             strategy);
+                             strategy,
+                             numThreads);
   return C;
 }
 
@@ -37,7 +40,8 @@ auto createGBF4Interface(const Matrix *inputMatrix,
 GBF4Interface::GBF4Interface(const PolynomialRing* originalRing,
                              const Matrix* inputMatrix,
                              const std::vector<int>& variableWeights,
-                             Strategy strategy
+                             Strategy strategy,
+                             int numThreads
                              )
     : mOriginalRing(originalRing),
       mFreeModule(inputMatrix->rows()),
@@ -56,7 +60,8 @@ GBF4Interface::GBF4Interface(const PolynomialRing* originalRing,
                              const FreeModule* freeModule,
                              const BasicPolyList& basicPolyList,
                              const std::vector<int>& variableWeights,
-                             Strategy strategy
+                             Strategy strategy,
+                             int numThreads
                              )
     : mOriginalRing(originalRing),
       mFreeModule(freeModule),
diff --git a/M2/Macaulay2/e/gb-f4/GBF4Interface.hpp b/M2/Macaulay2/e/gb-f4/GBF4Interface.hpp
index 7d624164489..be55b99a304 100644
--- a/M2/Macaulay2/e/gb-f4/GBF4Interface.hpp
+++ b/M2/Macaulay2/e/gb-f4/GBF4Interface.hpp
@@ -10,7 +10,8 @@ class Matrix;
 
 auto createGBF4Interface(const Matrix *inputMatrix,
                          const std::vector<int>& variableWeights, // what is this, do we need it?
-                         int strategy
+                         int strategy,
+                         int numThreads
                          ) -> GBComputation*;
 
 namespace newf4 {
@@ -21,7 +22,8 @@ enum class Strategy { Normal };
 
 auto createGBF4Interface(const Matrix *inputMatrix,
                          const std::vector<int>& variableWeights, // what is this, do we need it?
-                         Strategy strategy // do we need this?
+                         Strategy strategy, // do we need this?
+                         int numThreads
                          ) -> GBComputation*;
 void populateComputation(const Matrix* M, GBF4Computation& C);
   
@@ -43,14 +45,16 @@ class GBF4Interface : public GBComputation
   GBF4Interface(const PolynomialRing* originalRing,
                 const Matrix* inputMatrix,
                 const std::vector<int>& variableWeights,
-                Strategy strategy
+                Strategy strategy,
+                int numThreads
                 );
 
   GBF4Interface(const PolynomialRing* originalRing,
                 const FreeModule* freeModule,
                 const BasicPolyList& basicPolyList,
                 const std::vector<int>& variableWeights,
-                Strategy strategy
+                Strategy strategy,
+                int numThreads
                 );
 
   ~GBF4Interface() override;
diff --git a/M2/Macaulay2/e/gb-walk.cpp b/M2/Macaulay2/e/gb-walk.cpp
index e0dd383a4e5..25696952039 100644
--- a/M2/Macaulay2/e/gb-walk.cpp
+++ b/M2/Macaulay2/e/gb-walk.cpp
@@ -94,7 +94,8 @@ GBComputation *GBWalker::make_gb(const Matrix *M) const
                                                false,
                                                -1,
                                                0,
-                                               0
+                                               0,
+                                               0 // TBB numThreads
                                                /* , max_reduction_count */
                                                );
   G0->set_stop_conditions(false,
diff --git a/M2/Macaulay2/e/interface/groebner.cpp b/M2/Macaulay2/e/interface/groebner.cpp
index e8eda3aca38..276cb58719c 100644
--- a/M2/Macaulay2/e/interface/groebner.cpp
+++ b/M2/Macaulay2/e/interface/groebner.cpp
@@ -103,6 +103,7 @@ Computation /* or null */ *IM2_GB_make(
     {
       test_over_RR_or_CC(m->get_ring());
       clear_emit_size();
+      int numThreads = M2_numTBBThreads; // settable from front end.
       return GBComputation::choose_gb(m,
                                       collect_syz,
                                       n_rows_to_keep,
@@ -111,6 +112,7 @@ Computation /* or null */ *IM2_GB_make(
                                       max_degree,
                                       algorithm,
                                       strategy,
+                                      numThreads,
                                       max_reduction_count);
   } catch (const exc::engine_error& e)
     {
diff --git a/M2/Macaulay2/e/polyquotient.cpp b/M2/Macaulay2/e/polyquotient.cpp
index a9952decd37..9f1e2019eda 100644
--- a/M2/Macaulay2/e/polyquotient.cpp
+++ b/M2/Macaulay2/e/polyquotient.cpp
@@ -220,7 +220,8 @@ GBComputation *PolyRingQuotient::make_gb(const ring_elem g) const
                                               false,
                                               -1,
                                               0,
-                                              0
+                                              0,
+                                              0 // TBB numThreads
                                               /* , max_reduction_count */
                                               );
   G->set_stop_conditions(false,
@@ -395,7 +396,8 @@ void PolyRingQuotient::syzygy(const ring_elem a,
                                               false,
                                               -1,
                                               0,
-                                              0
+                                              0,
+                                              0 // TBB numThreads
                                               /* , max_reduction_count */
                                               );
   G->set_stop_conditions(false,
diff --git a/M2/Macaulay2/e/schreyer-resolution/res-f4.cpp b/M2/Macaulay2/e/schreyer-resolution/res-f4.cpp
index 135b8452e56..292290156b5 100644
--- a/M2/Macaulay2/e/schreyer-resolution/res-f4.cpp
+++ b/M2/Macaulay2/e/schreyer-resolution/res-f4.cpp
@@ -581,15 +581,6 @@ void F4Res::gaussReduce()
   bool onlyConstantMaps = false;
   std::vector<bool> track(mReducers.size());
 
-#if defined(WITH_TBB)
-  //std::cout << "about to do parallel gauss_reduce" << std::endl;
-  using threadLocalDense_t = mtbb::enumerable_thread_specific<ElementArray>;
-  // create a dense array for each thread
-  threadLocalDense_t threadLocalDense([&]() { 
-    return mRing.vectorArithmetic().allocateElementArray(static_cast<ComponentIndex>(mColumns.size()));
-  });
-#endif
-  
   if (onlyConstantMaps)  // and not exterior algebra?
     {
       for (auto i = 0; i < mReducers.size(); i++)
@@ -601,6 +592,14 @@ void F4Res::gaussReduce()
         }
     }
 
+#if defined(WITH_TBB)
+  //std::cout << "about to do parallel gauss_reduce" << std::endl;
+  using threadLocalDense_t = mtbb::enumerable_thread_specific<ElementArray>;
+  // create a dense array for each thread
+  threadLocalDense_t threadLocalDense([&]() { 
+    return mRing.vectorArithmetic().allocateElementArray(static_cast<ComponentIndex>(mColumns.size()));
+  });
+  
   // Reduce to zero every spair. Recording creates the
   // corresponding syzygy, which is auto-reduced and correctly ordered.
 
@@ -609,7 +608,6 @@ void F4Res::gaussReduce()
   //  std::cout << "gauss_row size: " << mRing.vectorArithmetic().size(gauss_row) <<
   //  std::endl;
 
-#if defined(WITH_TBB)
   //size_t chunk_size = std::max(mSPairs.size() / (100*mFrame.getNumThreads()), (size_t) 1);
   //mFrame.getScheduler().execute([this,&chunk_size,&onlyConstantMaps,&track,&threadLocalDense] {
   mFrame.getScheduler().execute([this,&onlyConstantMaps,&track,&threadLocalDense] {
diff --git a/M2/Macaulay2/packages/undistributed-packages/ExampleFreeResolutions.m2 b/M2/Macaulay2/packages/undistributed-packages/ExampleFreeResolutions.m2
index e802f81557e..52805b5520b 100644
--- a/M2/Macaulay2/packages/undistributed-packages/ExampleFreeResolutions.m2
+++ b/M2/Macaulay2/packages/undistributed-packages/ExampleFreeResolutions.m2
@@ -1018,7 +1018,7 @@ peek M.cache#(ResolutionContext{}).Result.Resolution.RawComputation
 -- Some benchmark examples for testing parallelism of Schreyer res code.
 
   -- I = Grassmannian(2, 5, CoefficientRing => ZZ/101)
-  -- I = Grassmannian(1, 6, CoefficientRing => ZZ/101)
+  I = Grassmannian(1, 6, CoefficientRing => ZZ/101)
 
   I = Grassmannian(1, 7, CoefficientRing => ZZ/101)
   debug Core
@@ -1048,3 +1048,15 @@ peek M.cache#(ResolutionContext{}).Result.Resolution.RawComputation
   elapsedTime freeResolution(ideal I_*, Strategy => Nonminimal)
   elapsedTime C = freeResolution(ideal I_*, Strategy => Nonminimal, ParallelizeByDegree => true)
 
+  restart
+  I = Grassmannian(2, 6, CoefficientRing => ZZ/101)
+  poincI = poincare ideal I_*
+  R = ring I
+  elapsedTime gens gb(ideal I_*, Algorithm => LinearAlgebra, Hilbert => poincI);
+  
+  restart
+  R = ZZ/101[a..g]
+  I = ideal random(R^1,R^{4:-4});
+  poincI = poincare ideal (a^4, b^4, c^4, d^4);
+  elapsedTime Igb = gens gb(ideal I_*, Algorithm => LinearAlgebra);
+  elapsedTime gens gb(ideal I_*, Algorithm => LinearAlgebra, Hilbert => poincI);  
diff --git a/M2/Macaulay2/packages/undistributed-packages/ExampleIdeals/gb-ideals.m2 b/M2/Macaulay2/packages/undistributed-packages/ExampleIdeals/gb-ideals.m2
index 32deb634f3c..4e3ccd359e3 100644
--- a/M2/Macaulay2/packages/undistributed-packages/ExampleIdeals/gb-ideals.m2
+++ b/M2/Macaulay2/packages/undistributed-packages/ExampleIdeals/gb-ideals.m2
@@ -40,7 +40,8 @@ J1=ideal"dgjm-chjm-dfkm+bhkm+cflm-bglm-dgin+chin+dekn-ahkn-celn+agln+dfio-bhio-d
   JMPS-INPS-JLQS+HNQS+ILRS-HMRS-JMOT+INOT+JKQT-GNQT-IKRT+GMRT+JLOU-HNOU-JKPU+GNPU+HKRU-GLRU-ILOV+HMOV+IKPV-GMPV-HKQV+GLQV";
 J1
 -*
-  time gb(ideal J1_*, Algorithm=>LinearAlgebra); -- 69.9 sec
+  gbTrace = 2
+  elapsedTime gb(ideal J1_*, Algorithm=>LinearAlgebra); -- 69.9 sec
     -- Mike M1 max,  std::variant VectorArithmetic: 18.0 sec, 18.46 sec
     -- Mike M1 max,  virtual VectorArithmetic: 17.88 sec, 19.06 sec
 
@@ -65,7 +66,7 @@ R1 = kk[a..g, MonomialSize=>8];
 J1 = ideal random(R1^1, R1^{-5,-5,-5,-6});
 J1
 -*     
-  gbTrace=1
+  gbTrace=2
   time gb(ideal J1_*, Algorithm=>LinearAlgebra); -- 54.1 sec (53.25 sec, 58.7 sec, MBP 7/17/09)
     -- Mike M1 max,  std::variant VectorArithmetic: 14.29 sec, 14.3  sec
     -- Mike M1 max,  virtual VectorArithmetic: 14.43 sec, 14.28  sec
@@ -639,3 +640,14 @@ I = sub(ideal last coefficients F, R)
 gbTrace=3
 time gens gb(I, DegreeLimit=>m*n);
 ----------------------------------------------
+
+--random5566
+restart
+kk = ZZ/101;
+R1 = kk[a..g, MonomialSize=>8];
+J1 = ideal random(R1^1, R1^{-5,-5,-6,-6});
+-*     
+  gbTrace=2
+  elapsedTime gens gb(ideal J1_*, Algorithm=>LinearAlgebra);   -- 26.3  M1 Mac Mini + Zoom; 16s gauss
+  elapsedTime groebnerBasis(ideal J1_*, Strategy=>"F4");  -- 6.5s  M1 Mac Mini + Zoom
+*-