diff --git a/include/relic_arch.h b/include/relic_arch.h index f149d4a5d..05ff65e07 100644 --- a/include/relic_arch.h +++ b/include/relic_arch.h @@ -95,6 +95,11 @@ ull_t arch_cycles(void); */ uint_t arch_lzcnt(dig_t); +/** + * Return the number of trailing zeros in an integer. + */ +uint_t arch_tzcnt(dig_t); + #if ARCH == AVR /** diff --git a/include/relic_bn.h b/include/relic_bn.h index 39046483c..bbfa6e740 100644 --- a/include/relic_bn.h +++ b/include/relic_bn.h @@ -1541,6 +1541,18 @@ void bn_rec_glv(bn_t k0, bn_t k1, const bn_t k, const bn_t n, const bn_t v1[], void bn_rec_frb(bn_t *ki, int sub, const bn_t k, const bn_t x, const bn_t n, int cof); +/** + * Recodes subscalars in the signed aligned column (GLV-SAC) representation. + * + * @param[out] b - the recoded subscalars. + * @param[in,out] len - the buffer capacity on input, the length in bytes of the recoding on output. + * @param[in] k - the subscalars to recode. + * @param[in] m - the number of subscalars to recode. + * @param[in] n - the elliptic curve group order. + * @throw ERR_NO_BUFFER - if the buffer capacity is insufficient. + */ +void bn_rec_sac(int8_t *b, size_t *len, bn_t *k, size_t m, bn_t n); + /** * Computes the coefficients of the polynomial representing the Lagrange * interpolation for a modulus and a given set of roots. diff --git a/include/relic_core.h b/include/relic_core.h index 13805aeb5..6941e014c 100644 --- a/include/relic_core.h +++ b/include/relic_core.h @@ -497,8 +497,10 @@ typedef struct _ctx_t { /** Function pointer to underlying lznct implementation. */ #if ARCH == X86 unsigned int (*lzcnt_ptr)(dig_t); + unsigned int (*tzcnt_ptr)(dig_t); #elif ARCH == X64 || ARCH == A64 unsigned int (*lzcnt_ptr)(ull_t); + unsigned int (*tzcnt_ptr)(ull_t); #endif } ctx_t; diff --git a/src/arch/relic_arch_a64.c b/src/arch/relic_arch_a64.c index f4dd6277a..1c4f1bd72 100644 --- a/src/arch/relic_arch_a64.c +++ b/src/arch/relic_arch_a64.c @@ -36,6 +36,7 @@ #include "relic_core.h" #include "lzcnt.inc" +#include "tzcnt.inc" /** * Renames the inline assembly macro to a prettier name. @@ -177,6 +178,8 @@ void arch_init(void) { if (ctx != NULL) { core_get()->lzcnt_ptr = (has_lzcnt_hard() ? lzcnt64_hard : lzcnt64_soft); + core_get()->tzcnt_ptr = + (has_tzcnt_hard() ? tzcnt64_hard : tzcnt64_soft); } #if TIMER == CYCLE @@ -199,6 +202,7 @@ void arch_clean(void) { ctx_t *ctx = core_get(); if (ctx != NULL) { core_get()->lzcnt_ptr = NULL; + core_get()->tzcnt_ptr = NULL; } } @@ -234,3 +238,7 @@ ull_t arch_cycles(void) { uint_t arch_lzcnt(dig_t x) { return core_get()->lzcnt_ptr((ull_t)x) - (8 * sizeof(ull_t) - WSIZE); } + +uint_t arch_tzcnt(dig_t x) { + return core_get()->tzcnt_ptr(x); +} diff --git a/src/arch/relic_arch_arm.c b/src/arch/relic_arch_arm.c index d6e662676..e079042b4 100644 --- a/src/arch/relic_arch_arm.c +++ b/src/arch/relic_arch_arm.c @@ -32,6 +32,7 @@ #include "relic_types.h" #include "lzcnt.inc" +#include "tzcnt.inc" /** * Renames the inline assembly macro to a prettier name. 
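A minimal sketch (not part of the patch) of how the new arch_tzcnt() hook is meant to be consumed, assuming the RELIC headers above are included; the helper name strip_power_of_two() is hypothetical:

    /* Divide a nonzero digit by its largest power-of-two factor, returning the
     * exponent through *count. This is the pattern the binary Jacobi symbol
     * code further down relies on. */
    static dig_t strip_power_of_two(dig_t x, uint_t *count) {
        *count = arch_tzcnt(x);   /* number of trailing zero bits of x. */
        return x >> *count;       /* odd part of x. */
    }
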
@@ -111,3 +112,11 @@ uint_t arch_lzcnt(uint_t x) { return lzcnt64_gcc_arm(x); #endif } + +uint_t arch_tzcnt(uint_t x) { +#if WSIZE == 32 + return tzcnt32_gcc_arm(x); +#elif WSIZE == 64 + return tzcnt64_gcc_arm(x); +#endif +} diff --git a/src/arch/relic_arch_avr.c b/src/arch/relic_arch_avr.c index 5ca50b16b..220045941 100644 --- a/src/arch/relic_arch_avr.c +++ b/src/arch/relic_arch_avr.c @@ -62,3 +62,15 @@ uint_t arch_lzcnt() { } return 0; } + +uint_t arch_tzcnt(dig_t a) { + static const uint8_t table[16] = { + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 + }; + if ((a & 0xF) != 0) { + return table[a & 0xF]; + } else { + return table[a >> 4] + 4; + } + return 0; +} diff --git a/src/arch/relic_arch_msp.c b/src/arch/relic_arch_msp.c index db2545492..314cb68d8 100644 --- a/src/arch/relic_arch_msp.c +++ b/src/arch/relic_arch_msp.c @@ -119,3 +119,32 @@ uint_t arch_lzcnt() { return 0; #endif } + +uint_t arch_tzcnt(dig_t a) { + static const uint8_t table[16] = { + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 + }; +#if WSIZE == 8 + if ((a & 0xF) != 0) { + return table[a & 0xF]; + } else { + return table[a >> 4] + 4; + } + return 0; +#elif WSIZE == 16 + int offset; + + if ((a & 0xFF) == 0) { + offset = 8; + } else { + offset = 0; + } + a = a >> offset; + if ((a & 0xF) != 0) { + return table[a & 0xF] + offset; + } else { + return table[(a >> 4) & 0xF] + 4 + offset; + } + return 0; +#endif +} diff --git a/src/arch/relic_arch_none.c b/src/arch/relic_arch_none.c index 9063cf574..98f57e744 100644 --- a/src/arch/relic_arch_none.c +++ b/src/arch/relic_arch_none.c @@ -83,7 +83,50 @@ uint_t arch_lzcnt(dig_t a) { #ifdef _MSC_VER return __lzcnt64(a); #else - return __builtin_clzll(a); + return __builtin_clzl(a); #endif #endif } + +uint_t arch_tzcnt(dig_t a) { +#if WSIZE == 8 || WSIZE == 16 + static const uint8_t table[16] = { + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 + }; +#endif +#if WSIZE == 8 + if ((a & 0xF) != 0) { + return table[a & 0xF]; + } else { + return table[a >> 4] + 4; + } + return 0; +#elif WSIZE == 16 + int offset; + + if ((a & 0xFF) == 0) { + offset = 8; + } else { + offset = 0; + } + a = a >> offset; + if ((a & 0xF) != 0) { + return table[a & 0xF] + offset; + } else { + return table[(a >> 4) & 0xF] + 4 + offset; + } + return 0; +#elif WSIZE == 32 +#ifdef _MSC_VER + return _tzcnt_u32(a); +#else + return __builtin_ctz(a); +#endif +#elif WSIZE == 64 +#ifdef _MSC_VER + return _tzcnt_u64(a); +#else + return __builtin_ctzl(a); #endif #endif } diff --git a/src/arch/relic_arch_x64.c b/src/arch/relic_arch_x64.c index 5bef299d8..baebd138a 100644 --- a/src/arch/relic_arch_x64.c +++ b/src/arch/relic_arch_x64.c @@ -36,6 +36,7 @@ #include "relic_core.h" #include "lzcnt.inc" +#include "tzcnt.inc" /** * Renames the inline assembly macro to a prettier name. @@ -51,6 +52,8 @@ void arch_init(void) { if (ctx != NULL) { core_get()->lzcnt_ptr = (has_lzcnt_hard() ? lzcnt64_hard : lzcnt64_soft); + core_get()->tzcnt_ptr = + (has_tzcnt_hard() ? 
tzcnt64_hard : tzcnt64_soft); } } @@ -58,6 +61,7 @@ void arch_clean(void) { ctx_t *ctx = core_get(); if (ctx != NULL) { core_get()->lzcnt_ptr = NULL; + core_get()->tzcnt_ptr = NULL; } } @@ -103,3 +107,7 @@ ull_t arch_cycles(void) { uint_t arch_lzcnt(dig_t x) { return core_get()->lzcnt_ptr((ull_t)x) - (8 * sizeof(ull_t) - WSIZE); } + +uint_t arch_tzcnt(dig_t x) { + return core_get()->tzcnt_ptr(x); +} diff --git a/src/arch/relic_arch_x86.c b/src/arch/relic_arch_x86.c index fc32b5415..2f0bbac2f 100644 --- a/src/arch/relic_arch_x86.c +++ b/src/arch/relic_arch_x86.c @@ -36,6 +36,7 @@ #include "relic_core.h" #include "lzcnt.inc" +#include "tzcnt.inc" /*============================================================================*/ /* Public definitions */ @@ -43,10 +44,12 @@ void arch_init(void) { core_get()->lzcnt_ptr = (has_lzcnt_hard() ? lzcnt32_hard : lzcnt32_soft); + core_get()->tzcnt_ptr = (has_tzcnt_hard() ? tzcnt32_hard : tzcnt32_soft); } void arch_clean(void) { core_get()->lzcnt_ptr = NULL; + core_get()->tzcnt_ptr = NULL; } ull_t arch_cycles(void) { @@ -64,3 +67,7 @@ ull_t arch_cycles(void) { uint_t arch_lzcnt(dig_t x) { return core_get()->lzcnt_ptr((uint32_t)x) - (8 * sizeof(uint32_t) - WSIZE); } + +uint_t arch_tzcnt(dig_t x) { + return core_get()->tzcnt_ptr(x); +} diff --git a/src/arch/tzcnt.inc b/src/arch/tzcnt.inc new file mode 100644 index 000000000..719761e79 --- /dev/null +++ b/src/arch/tzcnt.inc @@ -0,0 +1,456 @@ +/* +Count trailing zero bits. Choice of public domain or MIT-0. + +David Reid - mackron@gmail.com + +The tzcnt32 and tzcnt64 functions count the number of trailing zero bits in a given 32- or 64-bit variable. When the input variable is 0, the +total size in bits will be returned (32 for tzcnt32 and 64 for tzcnt64). + +For x86/64 platforms, this will use the TZCNT instruction if available. On ARM it will be implemented in terms of the CLZ instruction. If these +are unavailable it will fall back to compiler-specific built-ins. If these are unavailable it'll fall back to the generic implementation. + + +License +======= + +Public Domain (www.unlicense.org) +------------------------------------------------- +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. + +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + + +Functions +--------- +tzcnt32_generic +tzcnt64_generic + Generic implementation. + +tzcnt32_msvc_bsf +tzcnt64_msvc_bsf + MSVC built-in implementation using _BitScanForward/64(). 
Note that tzcnt64_msvc_bsf() is only available when compiling as 64-bit. + +tzcnt32_gcc_builtin +tzcnt64_gcc_builtin + GCC/Clang built-in implementation using __builtin_ctzl/ll(). Note that tzcnt64_gcc_builtin() is only available when compiling as 64-bit. + +tzcnt32_msvc_x86 +tzcnt64_msvc_x64 + MSVC implementation in terms of the __lzcnt and _tzcnt_u64 intrinsics. Note that these are only available when targeting x86/64. tzcnt64_msvc_x64() + is only available when compiling as 64-bit. + +tzcnt32_gcc_x86 +tzcnt64_gcc_x64 + GCC/Clang inline assembly implementation. This will emit the TZCNT instruction. Note that these are only available when targeting x86/x64 + and when compiled using a compiler that supports GCC style inline assembly. + +tzcnt32_gcc_arm +tzcnt64_gcc_arm + GCC/Clang inline assembly implementation. This will be implemented in terms of the CLZ instruction. Note that these are only available when + targeting ARM architecture version 5 and above and when compiled using a compiler that supports GCC style inline assembly. + +tzcnt32_hard +tzcnt64_hard + High level helper for calling a hardware implementation. This will choose either tzcnt32_msvc_x86()/tzcnt64_msvc_x64(), tzcnt32_gcc_x86()/ + tzcnt64_gcc_x64() or tzcnt32_gcc_arm()/tzcnt64_gcc_arm() depending on the environment. Note that these are only available when targeting x86/64 or ARM. tzcnt64_hard() is only available + when compiling as 64-bit. You should only call this if has_tzcnt_hard() returns non-zero. + +tzcnt32_soft +tzcnt64_soft + High level helper for calling the best software implementation available for the current build environment. + +tzcnt32 +tzcnt64 + High level helper for calling either a hardware or software implementation depending on the build environment. This will always favor a + hardware implementation. Do not call this in high performance code. The reason for this is that it will call has_tzcnt_hard() each + time, which may be too fine grained for your purposes. You may be better off calling has_tzcnt_hard() once at a higher level. + +has_tzcnt_hard + Determines whether or not a hardware implementation of tzcnt is available. Use this to know whether or not you can call tzcnt32/64_hard(). + Note that this calls CPUID on each invocation, so you may want to cache the result. Use HAS_TZCNT32/64_HARD to check for compile-time support. 
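+
+Example
+-------
+A small usage sketch (illustrative only; my_tzcnt32 and my_init are placeholder
+names, not part of this file). Probe for hardware support once and cache the
+choice, which is exactly what the relic_arch_*.c backends do with their
+function pointers:
+
+    static unsigned int (*my_tzcnt32)(unsigned int);
+
+    void my_init(void)
+    {
+    #if defined(HAS_TZCNT32_HARD)
+        my_tzcnt32 = has_tzcnt_hard() ? tzcnt32_hard : tzcnt32_soft;
+    #else
+        my_tzcnt32 = tzcnt32_soft;
+    #endif
+    }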
+*/ +#if defined(_MSC_VER) +#include <intrin.h> +#endif + +#if defined(__i386) || defined(_M_IX86) + #define ARCH_X86 +#elif defined(__x86_64__) || defined(_M_X64) + #define ARCH_X64 +#elif (defined(__arm__) && defined(__ARM_ARCH) && __ARM_ARCH >= 5) || (defined(_M_ARM) && _M_ARM >= 5) || defined(__ARM_FEATURE_CLZ) /* ARM (Architecture Version 5) */ + #define ARCH_ARM +#endif + +#if defined(_WIN64) || defined(_LP64) || defined(__LP64__) + #define ARCH_64BIT +#else + #define ARCH_32BIT +#endif + +#if defined(ARCH_X86) || defined(ARCH_X64) + /* x86/64 */ + #if defined(_MSC_VER) && _MSC_VER >= 1500 + #define HAS_TZCNT32_HARD + #if defined(ARCH_64BIT) + #define HAS_TZCNT64_HARD + #endif + #elif defined(__GNUC__) || defined(__clang__) + #define HAS_TZCNT32_HARD + #if defined(ARCH_64BIT) + #define HAS_TZCNT64_HARD + #endif + #endif +#elif defined(ARCH_ARM) + /* ARM */ + #if defined(__GNUC__) || defined(__clang__) + #define HAS_TZCNT32_HARD + #if defined(ARCH_64BIT) + #define HAS_TZCNT64_HARD + #endif + #endif +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1500 && (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(__clang__) + #define HAS_TZCNT_INTRINSIC +#elif (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) + #define HAS_TZCNT_INTRINSIC +#elif defined(__clang__) + #if defined(__has_builtin) + #if __has_builtin(__builtin_ctzll) || __has_builtin(__builtin_ctzl) + #define HAS_TZCNT_INTRINSIC + #endif + #endif +#endif + +inline unsigned int tzcnt32_generic(unsigned int x) +{ + unsigned int n; + + /* Special case for odd numbers since they should happen about half the time. */ + if (x & 0x1) { + return 0; + } + + if (x == 0) { + return sizeof(x) << 3; + } + + n = 1; + if ((x & 0x0000FFFF) == 0) { x >>= 16; n += 16; } + if ((x & 0x000000FF) == 0) { x >>= 8; n += 8; } + if ((x & 0x0000000F) == 0) { x >>= 4; n += 4; } + if ((x & 0x00000003) == 0) { x >>= 2; n += 2; } + n -= x & 0x00000001; + + return n; +} + +inline unsigned int tzcnt64_generic(unsigned long long x) +{ + unsigned int n; + + /* Special case for odd numbers since they should happen about half the time. */ + if (x & 0x1) { + return 0; + } + + if (x == 0) { + return sizeof(x) << 3; + } + + n = 1; + if ((x & 0xFFFFFFFF) == 0) { x >>= 32; n += 32; } + if ((x & 0x0000FFFF) == 0) { x >>= 16; n += 16; } + if ((x & 0x000000FF) == 0) { x >>= 8; n += 8; } + if ((x & 0x0000000F) == 0) { x >>= 4; n += 4; } + if ((x & 0x00000003) == 0) { x >>= 2; n += 2; } + n -= x & 0x00000001; + + return n; +} + +/* Generic compiler specific intrinsics. */ +#if defined(_MSC_VER) +static unsigned int tzcnt32_msvc_bsf(unsigned int x) +{ + unsigned long n; + + if (x == 0) { + return 32; + } + + _BitScanForward(&n, x); + + return n; +} + +/* _BitScanForward64() is only available on 64-bit builds. 
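+ On 32-bit builds, tzcnt64_soft() falls back to the generic tzcnt64_generic() implementation instead.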
*/ +#if defined(ARCH_64BIT) +static unsigned int tzcnt64_msvc_bsf(unsigned long long x) +{ + unsigned long n; + + if (x == 0) { + return 64; + } + + _BitScanForward64(&n, x); + + return n; +} +#endif /* ARCH_64BIT */ +#elif (defined(__GNUC__) || defined(__clang__)) && defined(HAS_TZCNT_INTRINSIC) +static unsigned int tzcnt32_gcc_builtin(unsigned int x) +{ + if (x == 0) { + return 32; + } + + return (unsigned int)__builtin_ctzl((unsigned long)x); +} + +static unsigned int tzcnt64_gcc_builtin(unsigned long long x) +{ + if (x == 0) { + return 64; + } + + return (unsigned int)__builtin_ctzll(x); +} +#endif + +static int has_tzcnt_hard() +{ +#if defined(ARCH_X86) || defined(ARCH_X64) + int info[4] = {0}; + + #if defined(_MSC_VER) + __cpuid(info, 0x80000001); + #elif defined(__GNUC__) || defined(__clang__) + /* + It looks like the -fPIC option uses the ebx register which GCC complains about. We can work around this by just using a different register, the + specific register of which I'm letting the compiler decide on. The "k" prefix is used to specify a 32-bit register. The {...} syntax is for + supporting different assembly dialects. + + What's basically happening is that we're saving and restoring the ebx register manually. + */ + #if defined(ARCH_X86) && defined(__PIC__) + __asm__ __volatile__ ( + "xchg{l} {%%}ebx, %k1;" + "cpuid;" + "xchg{l} {%%}ebx, %k1;" + : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(0x80000001), "c"(0) + ); + #else + __asm__ __volatile__ ( + "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(0x80000001), "c"(0) + ); + #endif + #endif + + return (info[2] & (1 << 5)) != 0; +#elif defined(ARCH_ARM) + return 1; /* The CLZ instruction is available starting from ARM architecture version 5. Our ARCH_ARM #define is only defined when targeting version 5 at compile time. */ +#else + return 0; /* Hardware TZCNT is only supported in x86/64 and ARM for now. */ +#endif +} + +/* Intrinsics and inline-assembly. x86/64 has a hardware TZCNT instruction. You can only call these if has_tzcnt_hard() returns true. */ +#if defined(HAS_TZCNT32_HARD) + #if defined(ARCH_X86) || defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + /* Unfortunately no tzcnt instrinsic on MSVC, but we can build it in terms of lzcnt(). */ + static unsigned int tzcnt32_msvc_x86(unsigned int x) + { + if (x == 0) { + return sizeof(x) << 3; + } + + return 31 - __lzcnt(x & -(int)x); + } + #elif defined(__GNUC__) || defined(__clang__) + static unsigned int tzcnt32_gcc_x86(unsigned int x) + { + /* + att: tzcntl [out], [in] + intel: tzcnt [in], [out] + */ + unsigned int r; + __asm__ __volatile__ ( + "tzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" + ); + + return r; + } + #endif + #endif + #if defined(ARCH_ARM) + #if defined(__GNUC__) || defined(__clang__) + /* The ARM implementation needs to be written in terms of the CLZ instruction. This can probably be optimized by implementing the whole function in assembly. 
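+ The trailing-zero count is derived from CLZ through the identity tzcnt(x) = 31 - clz(x & -x):
+ the lowest set bit is isolated first and CLZ then gives its position. On cores that also
+ provide RBIT, a bit-reversal followed by CLZ is the usual alternative.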
*/ + static unsigned int tzcnt32_gcc_arm(unsigned int x) + { + unsigned int r; + + if (x == 0) { + return sizeof(x) << 3; + } + + __asm__ __volatile__ ( + #if defined(ARCH_32BIT) + "clz %[out], %[in]" : [out]"=r"(r) : [in]"r"(x & -x) + #else + "clz %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(x & -x) + #endif + ); + + return 31 - r; + } + #endif + #endif + + static unsigned int tzcnt32_hard(unsigned int x) + { + #if defined(ARCH_X86) || defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + return tzcnt32_msvc_x86(x); + #elif defined(__GNUC__) || defined(__clang__) + return tzcnt32_gcc_x86(x); + #else + #error "This compiler does not support the tzcnt intrinsic." + #endif + #elif defined(ARCH_ARM) + #if defined(__GNUC__) || defined(__clang__) + return tzcnt32_gcc_arm(x); + #else + #error "This compiler does not support the clz intrinsic." + #endif + #else + #error "The build target does not support a native instruction." + #endif + } +#endif + +#if defined(HAS_TZCNT64_HARD) + #if defined(ARCH_X86) || defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + static unsigned int tzcnt64_msvc_x64(unsigned long long x) + { + return _tzcnt_u64(x); + } + #elif defined(__GNUC__) || defined(__clang__) + static unsigned int tzcnt64_gcc_x64(unsigned long long x) + { + /* + att: tzcnt [out], [in] + intel: tzcnt [in], [out] + */ + unsigned long long r; + __asm__ __volatile__ ( + "tzcnt{ %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" + ); + + return r; + } + #endif + #endif + #if defined(ARCH_ARM) + #if defined(__GNUC__) || defined(__clang__) + static unsigned int tzcnt64_gcc_arm(unsigned long long x) + { + return __builtin_ctzll(x); + } + #endif + #endif + + static unsigned int tzcnt64_hard(unsigned long long x) + { + #if defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + return tzcnt64_msvc_x64(x); + #elif defined(__GNUC__) || defined(__clang__) + return tzcnt64_gcc_x64(x); + #else + #error "This compiler does not support the tzcnt intrinsic." + #endif + #elif defined(ARCH_ARM) && defined(ARCH_64BIT) + #if defined(__GNUC__) || defined(__clang__) + return tzcnt64_gcc_arm(x); + #else + #error "This compiler does not support the clz intrinsic." + #endif + #else + #error "The build target does not support a native instruction." 
+ #endif + } +#endif + + +static unsigned int tzcnt32_soft(unsigned int x) +{ +#if defined(_MSC_VER) + return tzcnt32_msvc_bsf(x); +#elif defined(HAS_TZCNT_INTRINSIC) + return tzcnt32_gcc_builtin(x); +#else + return tzcnt32_generic(x); +#endif +} + +static unsigned int tzcnt64_soft(unsigned long long x) +{ +#if defined(ARCH_64BIT) + #if defined(_MSC_VER) + return tzcnt64_msvc_bsf(x); + #elif defined(HAS_TZCNT_INTRINSIC) + return tzcnt64_gcc_builtin(x); + #else + return tzcnt64_generic(x); + #endif +#else + return tzcnt64_generic(x); +#endif +} + + +inline static unsigned int tzcnt32(unsigned int x) +{ +#if defined(HAS_TZCNT32_HARD) + if (has_tzcnt_hard()) { + return tzcnt32_hard(x); + } else +#endif + { + return tzcnt32_soft(x); + } +} + +inline static unsigned int tzcnt64(unsigned int x) +{ +#if defined(HAS_TZCNT64_HARD) + if (has_tzcnt_hard()) { + return tzcnt64_hard(x); + } else +#endif + { + return tzcnt64_soft(x); + } +} diff --git a/src/bn/relic_bn_rec.c b/src/bn/relic_bn_rec.c index d48b63660..2f8b2c210 100644 --- a/src/bn/relic_bn_rec.c +++ b/src/bn/relic_bn_rec.c @@ -876,16 +876,24 @@ void bn_rec_glv(bn_t k0, bn_t k1, const bn_t k, const bn_t n, const bn_t *v1, } } -void bn_rec_sac(bn_t *b, bn_t *k, size_t m, bn_t n) { +void bn_rec_sac(int8_t *b, size_t *len, bn_t *k, size_t m, bn_t n) { /* Assume k0 is the sign-aligner. */ bn_t *t = RLC_ALLOCA(bn_t, m); size_t l = RLC_CEIL(bn_bits(n), m) + 1; + int8_t bji; if (t == NULL) { RLC_THROW(ERR_NO_MEMORY); return; } + if (*len <= l) { + *len = 0; + RLC_FREE(t); + RLC_THROW(ERR_NO_BUFFER); + return; + } + RLC_TRY { for (size_t i = 0; i < m; i++) { bn_null(t[i]); @@ -893,25 +901,32 @@ void bn_rec_sac(bn_t *b, bn_t *k, size_t m, bn_t n) { bn_copy(t[i], k[i]); } - bn_set_bit(b[0], l - 1, 0); + /* The current basis for BN curves might be one bit longer. */ + for (size_t i = 0; i < m; i++) { + l = RLC_MAX(l, bn_bits(k[i]) + 1); + } + + b[l - 1] = 0; for (size_t i = 0; i < l - 1; i++) { - bn_set_bit(b[0], i, 1 - bn_get_bit(k[0], i + 1)); + b[i] = 1 - bn_get_bit(k[0], i + 1); } for (size_t j = 1; j < m; j++) { - for (size_t i = 0; i < l; i++) { - uint8_t bji = bn_get_bit(t[j], 0); - bn_set_bit(b[j], i, bji); + for (size_t i = 0; i < l - 1; i++) { + bji = bn_get_bit(t[j], 0); + b[j * l + i] = bji; bn_hlv(t[j], t[j]); - bn_add_dig(t[j], t[j], bji & bn_get_bit(b[0], i)); + bn_add_dig(t[j], t[j], bji & b[i]); } + b[j * l + l - 1] = bn_get_bit(t[j], 0); } + *len = l; } RLC_CATCH_ANY { RLC_THROW(ERR_CAUGHT); } RLC_FINALLY { for (size_t i = 0; i < m; i++) { bn_free(t[i]); - RLC_FREE(t); } + RLC_FREE(t); } } diff --git a/src/bn/relic_bn_smb.c b/src/bn/relic_bn_smb.c index 98f74dd82..d1446ae0e 100644 --- a/src/bn/relic_bn_smb.c +++ b/src/bn/relic_bn_smb.c @@ -77,14 +77,21 @@ int bn_smb_leg(const bn_t a, const bn_t b) { } int bn_smb_jac(const bn_t a, const bn_t b) { - bn_t t0, t1, r; - int t, h, res; + dis_t ai, bi, ci, di; + dig_t n, d, t; + bn_t t0, t1, t2, t3; + uint_t z, i, s = (RLC_DIG >> 1) - 2; + int r; bn_null(t0); bn_null(t1); - bn_null(r); + bn_null(t2); + bn_null(t3); - /* Argument b must be odd. */ + /* Optimized Pornin's Algorithm by Aleksei Vambol from + * https://github.com/privacy-scaling-explorations/halo2curves/pull/95 */ + + /* Argument b must be odd for Jacobi symbol. 
*/ if (bn_is_even(b) || bn_sign(b) == RLC_NEG) { RLC_THROW(ERR_NO_VALID); return 0; @@ -93,55 +100,119 @@ int bn_smb_jac(const bn_t a, const bn_t b) { RLC_TRY { bn_new(t0); bn_new(t1); - bn_new(r); - t = 1; + bn_new(t2); + bn_new(t3); - if (bn_sign(a) == RLC_NEG) { - bn_add(t0, a, b); - } else { - bn_copy(t0, a); - } + bn_mod(t0, a, b); bn_copy(t1, b); + t = 0; while (1) { - /* t0 = a mod b. */ - bn_mod(t0, t0, t1); - /* If a = 0 then if n = 1 return t else return 0. */ - if (bn_is_zero(t0)) { - if (bn_cmp_dig(t1, 1) == RLC_EQ) { - res = 1; - if (t == -1) { - res = -1; + ai = di = 1; + bi = ci = 0; + + i = RLC_MAX(t0->used, t1->used); + dv_zero(t0->dp + t0->used, i - t0->used); + dv_zero(t1->dp + t1->used, i - t1->used); + if (i == 1) { + n = t0->dp[0]; + d = t1->dp[0]; + while (n != 0) { + if (n & 1) { + if (n < d) { + RLC_SWAP(n, d); + t ^= (n & d); + } + n = (n - d) >> 1; + t ^= d ^ (d >> 1); + } else { + z = arch_tzcnt(n); + t ^= (d ^ (d >> 1)) & (z << 1); + n >>= z; + } + } + r = (d == 1 ? 1 - (t & 2) : 0); + break; + } + + z = RLC_MIN(arch_lzcnt(t0->dp[i - 1]), arch_lzcnt(t1->dp[i - 1])); + n = t0->dp[i - 1] << z; + d = t1->dp[i - 1] << z; + if (z > (RLC_DIG >> 1)) { + n |= t0->dp[i - 2] >> z; + d |= t1->dp[i - 2] >> z; + } + n = (n & RLC_HMASK) | (t0->dp[0] & RLC_LMASK); + d = (d & RLC_HMASK) | (t1->dp[0] & RLC_LMASK); + + i = s; + while (i > 0) { + if (n & 1) { + if (n < d) { + RLC_SWAP(ai, ci); + RLC_SWAP(bi, di); + RLC_SWAP(n, d); + t ^= (n & d); } - break; + n = (n - d) >> 1; + ai = ai - ci; + bi = bi - di; + ci += ci; + di += di; + t ^= d ^ (d >> 1); + i -= 1; } else { - res = 0; - break; + z = RLC_MIN(i, arch_tzcnt(n)); + t ^= (d ^ (d >> 1)) & (z << 1); + ci = (dig_t)ci << z; + di = (dig_t)di << z; + n >>= z; + i -= z; } } - /* Write t0 as 2^h * t0. */ - h = 0; - while (bn_is_even(t0) && !bn_is_zero(t0)) { - h++; - bn_rsh(t0, t0, 1); + + if (ai < 0) { + bn_mul_dig(t2, t0, -ai); + bn_neg(t2, t2); + } else { + bn_mul_dig(t2, t0, ai); } - /* If h != 0 (mod 2) and n != +-1 (mod 8) then t = -t. */ - bn_mod_2b(r, t1, 3); - if ((h % 2 != 0) && (bn_cmp_dig(r, 1) != RLC_EQ) && - (bn_cmp_dig(r, 7) != RLC_EQ)) { - t = -t; + if (bi < 0) { + bn_mul_dig(t3, t1, -bi); + bn_neg(t3, t3); + } else { + bn_mul_dig(t3, t1, bi); } - /* If t0 != 1 (mod 4) and n != 1 (mod 4) then t = -t. */ - bn_mod_2b(r, t0, 2); - if (bn_cmp_dig(r, 1) != RLC_EQ) { - bn_mod_2b(r, t1, 2); - if (bn_cmp_dig(r, 1) != RLC_EQ) { - t = -t; - } + bn_add(t3, t3, t2); + + if (ci < 0) { + bn_mul_dig(t2, t0, -ci); + bn_neg(t2, t2); + } else { + bn_mul_dig(t2, t0, ci); + } + if (di < 0) { + bn_mul_dig(t1, t1, -di); + bn_neg(t1, t1); + } else { + bn_mul_dig(t1, t1, di); + } + bn_add(t1, t1, t2); + bn_rsh(t1, t1, s); + bn_rsh(t0, t3, s); + + if (bn_is_zero(t0)) { + r = (bn_cmp_dig(t1, 1) == RLC_EQ ? 
1 - (t & 2) : 0); + break; + } + + if (bn_sign(t0) == RLC_NEG) { + t ^= t1->dp[0]; + bn_neg(t0, t0); + } + if (bn_sign(t1) == RLC_NEG) { + bn_neg(t1, t1); } - bn_copy(r, t0); - bn_copy(t0, t1); - bn_copy(t1, r); } } RLC_CATCH_ANY { @@ -150,8 +221,9 @@ int bn_smb_jac(const bn_t a, const bn_t b) { RLC_FINALLY { bn_free(t0); bn_free(t1); - bn_free(r); + bn_free(t2); + bn_free(t3); } - return res; + return r; } diff --git a/src/ep/relic_ep_map.c b/src/ep/relic_ep_map.c index fb923a61e..c96b78795 100644 --- a/src/ep/relic_ep_map.c +++ b/src/ep/relic_ep_map.c @@ -258,6 +258,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { bn_null(k); fp_null(v); fp_null(w); + fp_null(y); fp_null(t1); fp_null(t2); fp_null(x1); @@ -271,6 +272,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { bn_new(k); fp_new(v); fp_new(w); + fp_new(y); fp_new(t1); fp_new(t2); fp_new(x1); @@ -478,6 +480,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { bn_free(k); fp_free(v); fp_free(w); + fp_free(y); fp_free(t1); fp_free(t2); fp_free(x1); @@ -488,8 +491,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { fp_free(d[2]); RLC_FREE(pseudo_random_bytes); for (size_t i = 0; i < 8; i++) { - fp_null(h[i]); - fp_new(h[i]); + fp_free(h[i]); } } } diff --git a/src/epx/relic_ep2_map.c b/src/epx/relic_ep2_map.c index a99998f8b..447050d53 100644 --- a/src/epx/relic_ep2_map.c +++ b/src/epx/relic_ep2_map.c @@ -516,6 +516,12 @@ void ep2_map_swift(ep2_t p, const uint8_t *msg, size_t len) { } RLC_FINALLY { bn_free(k); + fp2_free(a); + fp2_free(b); + fp2_free(c); + fp2_free(d); + fp2_free(e); + fp2_free(f); fp2_free(t); fp2_free(u); fp2_free(v); diff --git a/src/epx/relic_ep2_mul.c b/src/epx/relic_ep2_mul.c index 68904d8f5..2e2386cbf 100644 --- a/src/epx/relic_ep2_mul.c +++ b/src/epx/relic_ep2_mul.c @@ -115,100 +115,102 @@ static void ep2_mul_gls_imp(ep2_t r, const ep2_t p, const bn_t k) { #if EP_MUL == LWREG || !defined(STRIP) static void ep2_mul_reg_gls(ep2_t r, const ep2_t p, const bn_t k) { - int8_t reg[4][RLC_FP_BITS + 1], b[4], s[4], c0, n0; - ep2_t q, w, t[4][1 << (RLC_WIDTH - 2)]; + size_t l; bn_t n, _k[4], u; - size_t l, len, _l[4]; + int8_t even, col, sac[4 * (RLC_FP_BITS + 1)]; + ep2_t q[4], t[1 << 3]; bn_null(n); bn_null(u); - ep2_null(q); - ep2_null(w); RLC_TRY { bn_new(n); bn_new(u); - ep2_new(q); - ep2_new(w); - for (size_t i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { bn_null(_k[i]); + ep2_null(q[i]); bn_new(_k[i]); - for (size_t j = 0; j < (1 << (RLC_WIDTH - 2)); j++) { - ep2_null(t[i][j]); - ep2_new(t[i][j]); - } + ep2_new(q[i]); + } + for (int i = 0; i < (1 << 3); i++) { + ep2_null(t[i]); + ep2_new(t[i]); } ep2_curve_get_ord(n); fp_prime_get_par(u); bn_mod(_k[0], k, n); bn_rec_frb(_k, 4, _k[0], u, n, ep_curve_is_pairf() == EP_BN); - - l = 0; - /* Make some extra room for BN curves that grow subscalars by 1. */ - len = bn_bits(u) + (ep_curve_is_pairf() == EP_BN); - ep2_norm(t[0][0], p); + ep2_norm(q[0], p); + ep2_frb(q[1], q[0], 1); + ep2_frb(q[2], q[1], 1); + ep2_frb(q[3], q[2], 1); for (size_t i = 0; i < 4; i++) { - s[i] = bn_sign(_k[i]); - bn_abs(_k[i], _k[i]); - b[i] = bn_is_even(_k[i]); - _k[i]->dp[0] |= b[i]; - - _l[i] = RLC_FP_BITS + 1; - bn_rec_reg(reg[i], &_l[i], _k[i], len, RLC_WIDTH); - l = RLC_MAX(l, _l[i]); - - /* Apply Frobenius before flipping sign to build table. 
*/ - if (i > 0) { - ep2_frb(t[i][0], t[i - 1][0], 1); - } + ep2_neg(r, q[i]); + fp2_copy_sec(q[i]->y, r->y, bn_sign(_k[i]) == RLC_NEG); + _k[i]->sign = RLC_POS; } + even = bn_is_even(_k[0]); + bn_add_dig(_k[0], _k[0], even); - for (size_t i = 0; i < 4; i++) { - ep2_neg(q, t[i][0]); - fp2_copy_sec(q->y, t[i][0]->y, s[i] == RLC_POS); - ep2_tab(t[i], q, RLC_WIDTH); + ep2_copy(t[0], q[0]); + for (size_t i = 1; i < (1 << 3); i++) { + l = util_bits_dig(i); + ep2_add(t[i], t[i ^ (1 << (l - 1))], q[l]); } + l = RLC_FP_BITS + 1; + bn_rec_sac(sac, &l, _k, 4, n); + #if defined(EP_MIXED) - fp2_set_dig(w->z, 1); - w->coord = BASIC; + ep2_norm_sim(t + 1, t + 1, (1 << 3) - 1); + fp2_set_dig(r->z, 1); + fp2_set_dig(q[1]->z, 1); + r->coord = q[1]->coord = BASIC; #else - w->coord = = EP_ADD; + r->coord = q[1]->coord = EP_ADD; #endif - ep2_set_infty(r); - for (int j = l - 1; j >= 0; j--) { - for (size_t i = 0; i < RLC_WIDTH - 1; i++) { - ep2_dbl(r, r); - } + col = 0; + for (int i = 3; i > 0; i--) { + col <<= 1; + col += sac[i * l + l - 1]; + } + for (size_t m = 0; m < (1 << 3); m++) { + fp2_copy_sec(r->x, t[m]->x, m == col); + fp2_copy_sec(r->y, t[m]->y, m == col); +#if !defined(EP_MIXED) + fp2_copy_sec(r->z, t[m]->z, m == col); +#endif + } - for (size_t i = 0; i < 4; i++) { - n0 = reg[i][j]; - c0 = (n0 >> 7); - n0 = ((n0 ^ c0) - c0) >> 1; - - for (size_t m = 0; m < (1 << (RLC_WIDTH - 2)); m++) { - fp2_copy_sec(w->x, t[i][m]->x, m == n0); - fp2_copy_sec(w->y, t[i][m]->y, m == n0); - #if !defined(EP_MIXED) - fp2_copy_sec(w->z, t[i][m]->z, m == n0); - #endif - } + ep2_neg(q[1], r); + fp2_copy_sec(r->y, q[1]->y, sac[l - 1] != 0); + for (int j = l - 2; j >= 0; j--) { + ep2_dbl(r, r); - ep2_neg(q, w); - fp2_copy_sec(q->y, w->y, c0 == 0); - ep2_add(r, r, q); + col = 0; + for (int i = 3; i > 0; i--) { + col <<= 1; + col += sac[i * l + j]; } + + for (size_t m = 0; m < (1 << 3); m++) { + fp2_copy_sec(q[1]->x, t[m]->x, m == col); + fp2_copy_sec(q[1]->y, t[m]->y, m == col); +#if !defined(EP_MIXED) + fp2_copy_sec(q[1]->z, t[m]->z, m == col); +#endif + } + ep2_neg(q[2], q[1]); + fp2_copy_sec(q[1]->y, q[2]->y, sac[j]); + ep2_add(r, r, q[1]); } - for (size_t i = 0; i < 4; i++) { - /* Tables are built with points already negated, so no need here. */ - ep2_sub(q, r, t[i][0]); - fp2_copy_sec(r->x, q->x, b[i]); - fp2_copy_sec(r->y, q->y, b[i]); - fp2_copy_sec(r->z, q->z, b[i]); - } + ep2_sub(q[1], r, q[0]); + fp2_copy_sec(r->x, q[1]->x, even); + fp2_copy_sec(r->y, q[1]->y, even); + fp2_copy_sec(r->z, q[1]->z, even); /* Convert r to affine coordinates. */ ep2_norm(r, r); @@ -219,13 +221,12 @@ static void ep2_mul_reg_gls(ep2_t r, const ep2_t p, const bn_t k) { RLC_FINALLY { bn_free(n); bn_free(u); - ep2_free(q); - ep2_free(w); for (int i = 0; i < 4; i++) { bn_free(_k[i]); - for (size_t j = 0; j < (1 << (RLC_WIDTH - 2)); j++) { - ep2_free(t[i][j]); - } + ep2_free(q[i]); + } + for (int i = 0; i < (1 << 3); i++) { + ep2_free(t[i]); } } } diff --git a/src/fp/relic_fp_smb.c b/src/fp/relic_fp_smb.c index 0ff9d5266..f9f2602bb 100644 --- a/src/fp/relic_fp_smb.c +++ b/src/fp/relic_fp_smb.c @@ -164,24 +164,24 @@ static dig_t porninstep(dis_t m[4],const dig_t f[2], const dig_t g[2], static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, int s) { - dig_t d0, t0, t1, t2, c0, c1, yi, ai = 1, bi = 0, ci = 0, di = 1, u = 0; + dig_t t0, t1, t2, c0, c1, yi, ai = 1, bi = 0, ci = 0, di = 1, u = 0; /* Unrolling twice makes it faster. 
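 The sign of delta is expanded into an all-ones mask (delta >> (RLC_DIG - 1)) and combined with
 the parity of x, so the conditional negations and additions below stay branchless and run in
 constant time.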
*/ for (s -= 2; s >= 0; s -= 2) { yi = y; - d0 = (delta >= 0); + c0 = delta >> (RLC_DIG - 1); c1 = -(x & 1); - c0 = (-d0) & c1; + c0 &= c1; - t0 = (y ^ -d0) + d0; - t1 = (ci ^ -d0) + d0; - t2 = (di ^ -d0) + d0; + t0 = (y ^ c0) - c0; + t1 = (ci ^ c0) - c0; + t2 = (di ^ c0) - c0; x += t0 & c1; ai += t1 & c1; bi += t2 & c1; - /* delta = RLC_SEL(delta + 1, -delta, c0) */ + /* delta = RLC_SEL(2 + delta, 2 - delta, c0) */ y += x & c0; ci += ai & c0; di += bi & c0; @@ -189,25 +189,25 @@ static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, x >>= 1; ci <<= 1; di <<= 1; - delta = (delta ^ c0) + 1; + delta = (delta ^ c0) - 1; u += ((yi & y) ^ (y >> 1)) & 2; u += (u & 1) ^ RLC_SIGN(ci); yi = y; - d0 = (delta >= 0); + c0 = delta >> (RLC_DIG - 1); c1 = -(x & 1); - c0 = (-d0) & c1; + c0 &= c1; - t0 = (y ^ -d0) + d0; - t1 = (ci ^ -d0) + d0; - t2 = (di ^ -d0) + d0; + t0 = (y ^ c0) - c0; + t1 = (ci ^ c0) - c0; + t2 = (di ^ c0) - c0; x += t0 & c1; ai += t1 & c1; bi += t2 & c1; - /* delta = RLC_SEL(delta + 1, -delta, c0) */ + /* delta = RLC_SEL(2 + delta, 2 - delta, c0) */ y += x & c0; ci += ai & c0; di += bi & c0; @@ -215,7 +215,7 @@ static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, x >>= 1; ci <<= 1; di <<= 1; - delta = (delta ^ c0) + 1; + delta = (delta ^ c0) - 1; u += ((yi & y) ^ (y >> 1)) & 2; u += (u & 1) ^ RLC_SIGN(ci); @@ -468,7 +468,7 @@ int fp_smb_divst(const fp_t a) { #if FP_SMB == JMPDS || !defined(STRIP) int fp_smb_jmpds(const fp_t a) { - dis_t m[4], d = 0; + dis_t m[4], d = -1; /* Iterations taken directly from https://github.com/sipa/safegcd-bounds */ const int iterations = (45907 * FP_PRIME + 26313) / 19929; int loops, precision, i, r = 0, s = RLC_DIG - 2; diff --git a/test/test_bn.c b/test/test_bn.c index 4e2f7f574..5ac90f83b 100644 --- a/test/test_bn.c +++ b/test/test_bn.c @@ -2281,6 +2281,29 @@ static int recoding(void) { bn_cmp(a, v2[2]) == RLC_EQ, end); } } TEST_END; + + TEST_CASE("glv-sac recoding is correct") { + size_t l = RLC_BN_BITS; + int8_t ptr[2 * RLC_BN_BITS] = { 0 }; + if (ep_param_set_any_endom() == RLC_OK) { + ep_curve_get_v1(v1); + ep_curve_get_v2(v2); + ep_curve_get_ord(b); + bn_rand_mod(a, b); + bn_rec_glv(b, c, a, b, (const bn_t *)v1, (const bn_t *)v2); + ep_curve_get_ord(v2[0]); + bn_rec_sac(ptr, &l, v1, 2, v2[0]); + if (bn_is_even(b)) { + bn_add_dig(b, b, 1); + } + bn_copy(v1[0], b); + bn_copy(v1[1], c); + for (size_t i = 0; i < l; i++) { + TEST_ASSERT(ptr[i] == 0 || ptr[i] == 1, end); + TEST_ASSERT(ptr[l + i] == 0 || ptr[l + i] == 1, end); + } + } + } TEST_END; #endif /* WITH_EP && EP_ENDOM */ } RLC_CATCH_ANY {