diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9dfadb29..699da1a7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -31,7 +31,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip install wheel setuptools
-        python -m pip install numpy scipy matplotlib bumps periodictable scikit-learn pytest pytest-cov
+        python -m pip install numpy scipy matplotlib bumps periodictable scikit-learn pytest pytest-cov numba
         python setup.py build_ext --inplace
         python setup.py build
         mkdir release
@@ -97,8 +97,6 @@ jobs:
         echo "SRC_DIST=$(ls dist/*.tar.gz)" >> $GITHUB_ENV
         echo "WINDOWS_WHL=$(ls dist/*win*.whl)" >> $GITHUB_ENV
         echo "MACOS_WHL=$(ls dist/*macos*.whl)" >> $GITHUB_ENV
-        echo "WINDOWS_LATEST_WHL=$(ls unstable/*win*.whl)" >> $GITHUB_ENV
-        echo "MACOS_LATEST_WHL=$(ls unstable/*macos*.whl)" >> $GITHUB_ENV
 
     - name: Update unstable release
       uses: johnwbyrd/update-release@v1.0.0
@@ -106,7 +104,7 @@ jobs:
         token: ${{ secrets.GITHUB_TOKEN }}
         release: unstable
         tag: sid
-        files: unstable/Refl1D-windows-exe-latest.zip ${{ env.WINDOWS_LATEST_WHL }} ${{ env.MACOS_LATEST_WHL }}
+        files: unstable/Refl1D-windows-exe-latest.zip
 
     - name: Update current release
       if: startsWith(github.ref, 'refs/tags')
diff --git a/refl1d/lib/calc_g_zs_cex.c b/refl1d/lib/calc_g_zs_cex.c
deleted file mode 100644
index d6e63e4c..00000000
--- a/refl1d/lib/calc_g_zs_cex.c
+++ /dev/null
@@ -1,256 +0,0 @@
-// Can't use Py_LIMITED_API with numpy arrays
-//#define Py_LIMITED_API 0x03020000
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#include <Python.h>
-#include <numpy/arrayobject.h>
-
-
-PyObject *Pcalc_g_zs(PyObject *self, PyObject *args)
-{
-    PyArrayObject *g_z_pao, *c_i_pao, *g_zs_pao;
-    double lambda_0, lambda_1, cval;
-    double *g_z, *c_i, *g_zs, *g_zs_next;
-    Py_ssize_t segments, layers, r, z, i;
-    
-    if (!PyArg_ParseTuple(args, "O!O!O!ddnn",
-                            &PyArray_Type,&g_z_pao,
-                            &PyArray_Type,&c_i_pao,
-                            &PyArray_Type,&g_zs_pao, 
-                            &lambda_0,
-                            &lambda_1,
-                            &layers,
-                            &segments))
-    {return NULL;}
-
-    g_z=(double *) PyArray_DATA(g_z_pao);
-    c_i=(double *) PyArray_DATA(c_i_pao);
-    g_zs=(double *) PyArray_DATA(g_zs_pao);
-    g_zs_next = g_zs + layers;
-
-    for (r=1; r<segments; ++r) {
-        cval = c_i[segments-r-1];
-        i = layers*(r-1); // Precalculating this index gave a 30% speedup
-
-        //g_zs[i+layers] = ... switching to a pointer accounted for 14%
-        *g_zs_next++ = g_z[0] * (
-            g_zs[i] * lambda_0
-            + g_zs[1+i] * lambda_1
-            + cval
-        );
-
-        for (z=1; z<(layers-1); ++z) {
-            *g_zs_next++ = g_z[z] * (
-                g_zs[z+i-1] * lambda_1
-                + g_zs[z+i] * lambda_0
-                + g_zs[z+i+1] * lambda_1
-                + cval
-            );
-        }
-
-        *g_zs_next++ = g_z[z] * (
-            g_zs[z+i] * lambda_0
-            + g_zs[z+i-1] * lambda_1
-            + cval
-        );
-
-    }   
-
-    Py_RETURN_NONE;
-}
-
-// 15% speedup for uniform distributions
-PyObject *Pcalc_g_zs_uniform(PyObject *self, PyObject *args)
-{
-    PyArrayObject *g_z_pao, *g_zs_pao;
-    double lambda_0, lambda_1;
-    double *g_z, *g_zs, *g_zs_next;
-    Py_ssize_t segments, layers, r, z, i;
-    
-    if (!PyArg_ParseTuple(args, "O!O!ddnn",
-                            &PyArray_Type,&g_z_pao,
-                            &PyArray_Type,&g_zs_pao,
-                            &lambda_0,
-                            &lambda_1,
-                            &layers,
-                            &segments))
-    {return NULL;}
-
-    g_z=(double *) PyArray_DATA(g_z_pao);
-    g_zs=(double *) PyArray_DATA(g_zs_pao);
-    g_zs_next = g_zs + layers;
-
-    for (r=1; r<segments; ++r) {
-        i = layers*(r-1); // Precalculating this index gave a 30% speedup
-
-        //g_zs[i+layers] = ...
-        *g_zs_next++ = g_z[0] * (
-            g_zs[i] * lambda_0
-            + g_zs[1+i] * lambda_1
-        );
-
-        for (z=1; z<(layers-1); ++z) {
-            *g_zs_next++ = g_z[z] * (
-                g_zs[z+i-1] * lambda_1
-                + g_zs[z+i] * lambda_0
-                + g_zs[z+i+1] * lambda_1
-            );
-        }
-
-        *g_zs_next++ = g_z[z] * (
-            g_zs[z+i] * lambda_0
-            + g_zs[z+i-1] * lambda_1
-        );
-
-    }   
-
-    Py_RETURN_NONE;
-}
-
-// enhanced with pointer magic for a 20% speed decrease?!
-PyObject *Pcalc_g_zs_pointers(PyObject *self, PyObject *args)
-{
-    PyArrayObject *g_z_pao, *c_i_pao, *g_zs_pao;
-    double lambda_0, lambda_1, cval;
-    double *g_z, *c_i, *g_zs, *g_zs_prev, *g_zs_prev_above, *g_zs_prev_below;
-    Py_ssize_t segments, layers, s, z;
-    
-    if (!PyArg_ParseTuple(args, "O!O!O!ddnn",
-                            &PyArray_Type,&g_z_pao,
-                            &PyArray_Type,&c_i_pao,
-                            &PyArray_Type,&g_zs_pao, 
-                            &lambda_0,
-                            &lambda_1,
-                            &layers,
-                            &segments))
-    {return NULL;}
-
-
-    g_z=(double *) PyArray_DATA(g_z_pao);
-    c_i=(double *) PyArray_DATA(c_i_pao)+segments-1; // iterate backwards
-    g_zs=(double *) PyArray_DATA(g_zs_pao);
-    
-    // Skip the first layer, which was defined outside this function    
-    g_zs_prev = g_zs;
-    g_zs_prev_above = g_zs + 1;
-    g_zs_prev_below = g_zs - 1;
-    g_zs += layers;
-
-    /*
-        for pointer voodoo, keep in mind that
-        *ptr++
-        is the same as
-        *(ptr++)
-        meaning, "Pointer moves to the next position (of an array),
-        but returns the old content"
-    */ 
-
-    for (s=1; s<segments; ++s) {
-        cval = *--c_i; // decrement and use the NEW position's content
-
-        //first and last layers have special cases
-        *g_zs++ = (
-                   (*g_zs_prev++) * lambda_0
-                   + (*g_zs_prev_above++) * lambda_1
-                   + cval
-                   ) * (*g_z++);
-        ++g_zs_prev_below; // didn't increment this one in the math
-
-        for (z=1; z<(layers-1); ++z) {
-            *g_zs++ = (
-                       (*g_zs_prev_below++) * lambda_1
-                       + (*g_zs_prev++) * lambda_0
-                       + (*g_zs_prev_above++) * lambda_1
-                       + cval
-                       ) * (*g_z++);
-        }
-
-        //first and last layers have special cases
-        *g_zs++ = (
-                   (*g_zs_prev_below++) * lambda_1
-                   + (*g_zs_prev++) * lambda_0
-                   + cval
-                   ) * (*g_z++);
-        ++g_zs_prev_above; // didn't increment this one in the math
-
-        //reset g_z
-        g_z -= layers;
-
-    }   
-    
-    // note: at this point, accessing g_zs could segfault
-
-    Py_RETURN_NONE;
-}
-
-
-static PyMethodDef calc_g_zs_cex_methods[] =
-{
-    {"_calc_g_zs", Pcalc_g_zs, METH_VARARGS, 
-    "_calc_g_zs(g_z,c_i,g_zs,lambda_0,lambda_1,layers,segments): calculate G(z,s)"
-    },
-    {"_calc_g_zs_uniform", Pcalc_g_zs_uniform, METH_VARARGS, 
-    "_calc_g_zs_uniform(g_z,g_zs,lambda_0,lambda_1,layers,segments): calculate G(z,s) for uniform chains"
-    },
-    {"_calc_g_zs_pointers", Pcalc_g_zs_pointers, METH_VARARGS, 
-    "_calc_g_zs_pointers(g_z,c_i,g_zs,lambda_0,lambda_1,layers,segments): calculate G(z,s) using pointer tricks"
-    },
-    {NULL, NULL, 0, NULL}
-};
-
-/*
-void initcalc_g_zs_cex(void)
-{
-    (void)Py_InitModule("calc_g_zs_cex", calc_g_zs_cex_methods);
-    import_array();
-}
-*/
-
-#define MODULE_DOC "calc_g_zs C Extension"
-#define MODULE_NAME "calc_g_zs_cex"
-#define MODULE_INIT2 initcalc_g_zs_cex
-#define MODULE_INIT3 PyInit_calc_g_zs_cex
-#define MODULE_METHODS calc_g_zs_cex_methods
-
-/* ==== boilerplate python 2/3 interface bootstrap ==== */
-
-#if defined(WIN32) && !defined(__MINGW32__)
-    #define DLL_EXPORT __declspec(dllexport)
-#else
-    #define DLL_EXPORT
-#endif
-
-#if PY_MAJOR_VERSION >= 3
-
-  DLL_EXPORT PyMODINIT_FUNC MODULE_INIT3(void)
-  {
-    PyObject* blah;
-    static struct PyModuleDef moduledef = {
-      PyModuleDef_HEAD_INIT,
-      MODULE_NAME,         /* m_name */
-      MODULE_DOC,          /* m_doc */
-      -1,                  /* m_size */
-      MODULE_METHODS,      /* m_methods */
-      NULL,                /* m_reload */
-      NULL,                /* m_traverse */
-      NULL,                /* m_clear */
-      NULL,                /* m_free */
-    };
-    blah = PyModule_Create(&moduledef);
-    import_array();
-    return blah;
-  }
-
-#else /* PY_MAJOR_VERSION >= 3 */
-
-  DLL_EXPORT PyMODINIT_FUNC MODULE_INIT2(void)
-  {
-    Py_InitModule4(MODULE_NAME,
-		 MODULE_METHODS,
-		 MODULE_DOC,
-		 0,
-		 PYTHON_API_VERSION
-		 );
-    import_array();
-  }
-
-#endif /* PY_MAJOR_VERSION >= 3 */
diff --git a/refl1d/lib/methods.h b/refl1d/lib/methods.h
index 877aaaa9..19c62801 100755
--- a/refl1d/lib/methods.h
+++ b/refl1d/lib/methods.h
@@ -6,92 +6,66 @@
 // TODO: can we free the view before using the pointer with writable vectors?
 /* ***
 // Vector binding glue
-#if (PY_VERSION_HEX > 0x03000000) && !defined(Py_LIMITED_API)
-  // Assuming that a view into a writable vector points to a 
-  // non-changing pointer for the duration of the C call, capture 
-  // the view pointer and immediately free the view.
-  #define VECTOR(VEC_obj, VEC_buf, VEC_len) do { \
-    Py_buffer VEC_view; \
-    int VEC_err = PyObject_GetBuffer(VEC_obj, &VEC_view, PyBUF_WRITABLE|PyBUF_FORMAT); \
-    if (VEC_err < 0 || sizeof(*VEC_buf) != VEC_view.itemsize) return NULL; \
-    VEC_buf = (typeof(VEC_buf))VEC_view.buf; \
-    VEC_len = VEC_view.len/sizeof(*VEC_buf); \
-    PyBuffer_Release(&VEC_view); \
-  } while (0)
-#else
-  #define VECTOR(VEC_obj, VEC_buf, VEC_len) do { \
-    int VEC_err = PyObject_AsWriteBuffer(VEC_obj, (void **)(&VEC_buf), &VEC_len); \
-    if (VEC_err < 0) return NULL; \
-    VEC_len /= sizeof(*VEC_buf); \
-  } while (0)
-#endif
+// Assuming that a view into a writable vector points to a
+// non-changing pointer for the duration of the C call, capture
+// the view pointer and immediately free the view.
+#define VECTOR(VEC_obj, VEC_buf, VEC_len) do { \
+  Py_buffer VEC_view; \
+  int VEC_err = PyObject_GetBuffer(VEC_obj, &VEC_view, PyBUF_WRITABLE|PyBUF_FORMAT); \
+  if (VEC_err < 0 || sizeof(*VEC_buf) != VEC_view.itemsize) return NULL; \
+  VEC_buf = (typeof(VEC_buf))VEC_view.buf; \
+  VEC_len = VEC_view.len/sizeof(*VEC_buf); \
+  PyBuffer_Release(&VEC_view); \
+} while (0)
 *** */
 
 // Vector binding glue
-#if 0 && (PY_VERSION_HEX > 0x03000000) && !defined(Py_LIMITED_API)
-
-  // Buffer protocol for python 3 requires freeing each view
-  // that is retrieved, so need to leave space to retrieve
-  // the buffer view for each vector and free them after they
-  // have been used.  This is particularly problematic if there
-  // is an error discovered while interpreting the inputs, since
-  // we may be freeing a subset.
-  #define DECLARE_VECTORS(VEC_n) \
-   Py_buffer VEC_views[VEC_n]; \
-   int VEC_current = 0; \
-   const int VEC_maximum = VEC_n;
-
-  #define FREE_VECTORS() \
-   do { \
-       int VEC_k = 0; \
-       while (VEC_k < VEC_current) PyBuffer_Release(&VEC_views[VEC_k++]); \
-    } while (0)
-
-  #define _VECTOR(VEC_obj, VEC_buf, VEC_len, VEC_flags) \
-   do { \
-       if (VEC_current >= VEC_maximum) { \
-           PyErr_SetString(PyExc_TypeError, "not enough vectors declared"); \
-           FREE_VECTORS(); \
-           return NULL; \
-       } else { \
-           Py_buffer *VEC_view = &VEC_views[VEC_current]; \
-           int VEC_err = PyObject_GetBuffer(VEC_obj, VEC_view, VEC_flags); \
-           if (VEC_err < 0) { FREE_VECTORS(); return NULL; } \
-           VEC_current++; \
-           if (sizeof(*VEC_buf) != VEC_view->itemsize) { \
-               PyErr_SetString(PyExc_TypeError, "wrong numeric type for vector"); \
-               FREE_VECTORS(); \
-               return NULL; \
-           } \
-           VEC_buf = (typeof(VEC_buf))VEC_view->buf; \
-           VEC_len = VEC_view->len/sizeof(*VEC_buf); \
-       } \
-   } while (0)
-
-  #define INVECTOR(obj, buf, len) _VECTOR(obj, buf, len, PyBUF_SIMPLE|PyBUF_FORMAT)
-  #define OUTVECTOR(obj, buf, len) _VECTOR(obj, buf, len, PyBUF_WRITABLE|PyBUF_FORMAT)
+// Buffer protocol for python 3 requires freeing each view
+// that is retrieved, so need to leave space to retrieve
+// the buffer view for each vector and free them after they
+// have been used.  This is particularly problematic if there
+// is an error discovered while interpreting the inputs, since
+// we may be freeing a subset.
+#define DECLARE_VECTORS(VEC_n) \
+  Py_buffer VEC_views[VEC_n]; \
+  int VEC_current = 0; \
+  const int VEC_maximum = VEC_n;
+
+#define FREE_VECTORS() \
+  do { \
+      int VEC_k = 0; \
+      while (VEC_k < VEC_current) PyBuffer_Release(&VEC_views[VEC_k++]); \
+  } while (0)
 
+#ifdef __cplusplus
+#    define CAST_ASSIGN(to_y, from_x) to_y = (decltype(to_y))(from_x)
 #else
-
-  #define DECLARE_VECTORS(n) do {} while(0)
-  #define FREE_VECTORS() do {} while(0)
-
-  #define INVECTOR(obj,buf,len) \
-    do { \
-        int err = PyObject_AsReadBuffer(obj, (const void **)(&buf), &len); \
-        if (err < 0) return NULL; \
-        len /= sizeof(*buf); \
-    } while (0)
-
-  #define OUTVECTOR(obj,buf,len) \
-    do { \
-        int err = PyObject_AsWriteBuffer(obj, (void **)(&buf), &len); \
-        if (err < 0) return NULL; \
-        len /= sizeof(*buf); \
-    } while (0)
-
+#    define CAST_ASSIGN(to_y, from_x) to_y = (typeof(to_y))(from_x)
 #endif
 
+#define _VECTOR(VEC_obj, VEC_buf, VEC_len, VEC_flags) \
+  do { \
+      if (VEC_current >= VEC_maximum) { \
+          PyErr_SetString(PyExc_TypeError, "not enough vectors declared"); \
+          FREE_VECTORS(); \
+          return NULL; \
+      } else { \
+          Py_buffer *VEC_view = &VEC_views[VEC_current]; \
+          int VEC_err = PyObject_GetBuffer(VEC_obj, VEC_view, VEC_flags); \
+          if (VEC_err < 0) { FREE_VECTORS(); return NULL; } \
+          VEC_current++; \
+          if (sizeof(*VEC_buf) != VEC_view->itemsize) { \
+              PyErr_SetString(PyExc_TypeError, "wrong numeric type for vector"); \
+              FREE_VECTORS(); \
+              return NULL; \
+          } \
+          CAST_ASSIGN(VEC_buf, VEC_view->buf); \
+          VEC_len = VEC_view->len/sizeof(*VEC_buf); \
+      } \
+  } while (0)
+
+#define INVECTOR(obj, buf, len) _VECTOR(obj, buf, len, PyBUF_SIMPLE|PyBUF_FORMAT)
+#define OUTVECTOR(obj, buf, len) _VECTOR(obj, buf, len, PyBUF_WRITABLE|PyBUF_FORMAT)
 #define SCALAR(obj) PyFloat_AsDouble(obj)
 
 
diff --git a/refl1d/polymer.py b/refl1d/polymer.py
index e391d9bc..ee55f56a 100644
--- a/refl1d/polymer.py
+++ b/refl1d/polymer.py
@@ -52,29 +52,21 @@
            "VolumeProfile", "layer_thickness"]
 
 import inspect
-
-import numpy as np
-
-from bumps.parameter import Parameter, to_dict
-from .model import Layer
-from . import util
 from time import time
+from collections import OrderedDict
 
-try:
-    from collections import OrderedDict
-except ImportError:
-    class OrderedDict(dict):
-        def popitem(self, *args, **kw):
-            return dict.popitem(self, *args)
-
+import numpy as np
 from numpy import real, imag, exp, log, sqrt, pi, hstack, ones_like
+from numpy.core.multiarray import correlate as old_correlate
+from bumps.parameter import Parameter, to_dict
 
-# This is okay to use as long as LAMBDA_ARRAY is symmetric,
-# otherwise a slice LAMBDA_ARRAY[::-1] is necessary
-from numpy.core.multiarray import correlate
+from . import util
+from .model import Layer
 
 LAMBDA_1 = 1.0/6.0 #always assume cubic lattice (1/6) for now
 LAMBDA_0 = 1.0 - 2.0*LAMBDA_1
+# Use reverse order for LAMBDA_ARRAY if it is asymmetric since we are using
+# it with correlate().
 LAMBDA_ARRAY = np.array([LAMBDA_1, LAMBDA_0, LAMBDA_1])
 MINLAT = 25
 MINBULK = 5
@@ -1004,7 +996,7 @@ def SCFeqns(phi_z, chi, chi_s, sigma, n_avg, p_i, phi_b=0):
 
     # calculate new g_z (Boltzmann weighting factors)
     u_prime = log((1.0 - phi_z) / (1.0 - phi_b))
-    u_int = 2 * chi * (correlate(phi_z, LAMBDA_ARRAY, 1) - phi_b)
+    u_int = 2 * chi * (old_correlate(phi_z, LAMBDA_ARRAY, 1) - phi_b)
     u_int[0] += chi_s
     u_z = u_prime + u_int
     g_z = exp(u_z)
@@ -1072,31 +1064,15 @@ def calc_phi_z(g_z, n_avg, sigma, phi_b, u_z_avg=0, p_i=None):
 
     return phi_z_ta + phi_z_free
 
-
 def compose(g_zs, g_zs_ngts, g_z):
     prod = g_zs * np.fliplr(g_zs_ngts)
     prod[np.isnan(prod)] = 0
     return np.sum(prod, axis=1) / g_z
 
-
 class Propagator(object):
-    cex = None
-
     def __init__(self, g_z, segments):
         self.g_z = g_z
         self.shape = int(g_z.size), int(segments)
-        # keep all future instances from retrying this test
-        if self.cex is not None:
-            return
-
-        try:
-            import refl1d.calc_g_zs_cex as cex
-            Propagator.cex = cex
-        except ImportError:
-            import warnings
-            warnings.warn('Could not load C extension for EndTetheredPolymer. Continuing with slower NumPy code.\n'
-                          'Try rebuilding refl1d to remove this warning and speed things up!')
-            Propagator.cex = False
 
     def ta(self):
         # terminally attached beginnings
@@ -1105,7 +1081,7 @@ def ta(self):
         g_zs = self._new()
         g_zs[:, 0] = 0.0
         g_zs[0, 0] = self.g_z[0]
-        self._calc_g_zs_uniform(self.g_z, g_zs)
+        _calc_g_zs_uniform(self.g_z, g_zs, LAMBDA_0, LAMBDA_1)
         return g_zs
 
     def free(self):
@@ -1114,7 +1090,7 @@ def free(self):
 
         g_zs = self._new()
         g_zs[:, 0] = self.g_z
-        self._calc_g_zs_uniform(self.g_z, g_zs)
+        _calc_g_zs_uniform(self.g_z, g_zs, LAMBDA_0, LAMBDA_1)
         return g_zs
 
     def ngts_u(self, c):
@@ -1123,7 +1099,7 @@ def ngts_u(self, c):
 
         g_zs = self._new()
         g_zs[:, 0] = c * self.g_z
-        self._calc_g_zs_uniform(self.g_z, g_zs)
+        _calc_g_zs_uniform(self.g_z, g_zs, LAMBDA_0, LAMBDA_1)
         return g_zs
 
     def ngts(self, c_i):
@@ -1132,27 +1108,56 @@ def ngts(self, c_i):
 
         g_zs = self._new()
         g_zs[:, 0] = c_i[-1] * self.g_z
-        self._calc_g_zs(self.g_z, c_i, g_zs)
+        _calc_g_zs(self.g_z, c_i, g_zs, LAMBDA_0, LAMBDA_1)
         return g_zs
 
     def _new(self):
         return np.empty(self.shape, order='F')
 
-    def _calc_g_zs(self, g_z, c_i, g_zs):
-        if self.cex:
-            self.cex._calc_g_zs(g_z, c_i, g_zs, LAMBDA_0, LAMBDA_1, *self.shape)
-        else:
-            pg_zs = g_zs[:, 0]
-            segment_iterator = enumerate(c_i[::-1])
-            next(segment_iterator)
-            for r, c in segment_iterator:
-                g_zs[:, r] = pg_zs = (correlate(pg_zs, LAMBDA_ARRAY, 1) + c) * g_z
-
-    def _calc_g_zs_uniform(self, g_z, g_zs):
-        if self.cex:
-            self.cex._calc_g_zs_uniform(g_z, g_zs, LAMBDA_0, LAMBDA_1, *self.shape)
-        else:
-            segments = g_zs.shape[1]
-            pg_zs = g_zs[:, 0]
-            for r in range(1, segments):
-                g_zs[:, r] = pg_zs = correlate(pg_zs, LAMBDA_ARRAY, 1) * g_z
+try:
+    from numba import njit
+    USE_NUMBA = True
+except ImportError:
+    USE_NUMBA = False
+
+#USE_NUMBA = False # Uncomment when doing timing tests
+
+if USE_NUMBA:
+    @njit('(f8[:], f8[:, :], f8, f8)', cache=True)
+    def _calc_g_zs_uniform(g_z, g_zs, f0, f1):
+        points, segments = g_zs.shape
+        for r in range(segments-1):
+            g_zs[0, r+1] = (g_zs[0, r]*f0 + g_zs[1, r]*f1)*g_z[0]
+            #g_zs[1:-1, r+1] = np.correlate(g_zs[:, r], [f1, f0, f1], 'valid')*g_z[1:-1]
+            for k in range(1, points-1):
+                g_zs[k, r+1] = (
+                    g_zs[k, r]*f0 + (g_zs[k-1, r] + g_zs[k+1, r])*f1)*g_z[k]
+            g_zs[-1, r+1] = (g_zs[-2, r]*f1 + g_zs[-1, r]*f0)*g_z[-1]
+
+    @njit('(f8[:], f8[:], f8[:, :], f8, f8)', cache=True)
+    def _calc_g_zs(g_z, c_i, g_zs, f0, f1):
+        points, segments = g_zs.shape
+        for r in range(segments-1):
+            c_ir = c_i[segments-(r+1)-1]
+            g_zs[0, r+1] = (g_zs[0, r]*f0 + g_zs[1, r]*f1 + c_ir)*g_z[0]
+            #g_zs[1:-1, r+1] = (np.correlate(g_zs[:, r], fir, 'valid') + c_ir)*g_z[1:-1]
+            for k in range(1, points-1):
+                g_zs[k, r+1] = (
+                    g_zs[k, r]*f0 + (g_zs[k-1, r] + g_zs[k+1, r])*f1 + c_ir)*g_z[k]
+            g_zs[-1, r+1] = (g_zs[-2, r]*f1 + g_zs[-1, r]*f0 + c_ir)*g_z[-1]
+
+else:
+    def _calc_g_zs(g_z, c_i, g_zs, f0, f1):
+        coeff = np.array([f1, f0, f1])
+        pg_zs = g_zs[:, 0]
+        segment_iterator = enumerate(c_i[::-1])
+        next(segment_iterator)
+        for r, c in segment_iterator:
+            g_zs[:, r] = pg_zs = (old_correlate(pg_zs, coeff, 1) + c) * g_z
+
+    def _calc_g_zs_uniform(g_z, g_zs, f0, f1):
+        coeff = np.array([f1, f0, f1])
+        segments = g_zs.shape[1]
+        pg_zs = g_zs[:, 0]
+        for r in range(1, segments):
+            g_zs[:, r] = pg_zs = old_correlate(pg_zs, coeff, 1) * g_z
diff --git a/setup.py b/setup.py
index e6a16f3b..12cf59bb 100755
--- a/setup.py
+++ b/setup.py
@@ -67,16 +67,6 @@ def reflmodule_config():
                      py_limited_api="cp32",
                      )
 
-# SCF extension
-def SCFmodule_config():
-    from numpy import get_include
-    return Extension('refl1d.calc_g_zs_cex',
-                     sources=[os.path.join('refl1d', 'lib', 'calc_g_zs_cex.c')],
-                     include_dirs=[get_include()],
-                     #language="c++",
-                     py_limited_api="cp32",
-                     )
-
 #TODO: write a proper dependency checker for packages which cannot be
 # installed by easy_install
 #dependency_check('numpy>=1.0', 'scipy>=0.6', 'matplotlib>=1.0', 'wx>=2.8.9')
@@ -107,9 +97,9 @@ def SCFmodule_config():
         'console_scripts': ['refl1d=refl1d.main:cli'],
         'gui_scripts': ['refl1d_gui=refl1d.main:gui']
     },
-    ext_modules=[reflmodule_config(), SCFmodule_config()],
+    ext_modules=[reflmodule_config()],
     install_requires=['bumps>=0.7.16', 'numpy', 'scipy', 'matplotlib', 'periodictable'],
-    extras_require={'full': ['wxpython']},
+    extras_require={'full': ['wxpython', 'ipython', 'numba']},
     cmdclass={'build_ext': build_ext_subclass},
     )