diff --git a/include/relic_arch.h b/include/relic_arch.h index f149d4a5d..05ff65e07 100644 --- a/include/relic_arch.h +++ b/include/relic_arch.h @@ -95,6 +95,11 @@ ull_t arch_cycles(void); */ uint_t arch_lzcnt(dig_t); +/** + * Return the number of trailing zeros in an integer. + */ +uint_t arch_tzcnt(dig_t); + #if ARCH == AVR /** diff --git a/include/relic_bn.h b/include/relic_bn.h index 39046483c..bbfa6e740 100644 --- a/include/relic_bn.h +++ b/include/relic_bn.h @@ -1541,6 +1541,18 @@ void bn_rec_glv(bn_t k0, bn_t k1, const bn_t k, const bn_t n, const bn_t v1[], void bn_rec_frb(bn_t *ki, int sub, const bn_t k, const bn_t x, const bn_t n, int cof); +/** + * Recodes subscalars in the signed aligned column (GLV-SAC) representation. + * + * @param[out] b - the recoded subscalars. + * @param[in,out] len - the buffer capacity on input, the length in bytes of the recoding on output. + * @param[in] k - the subscalars to recode. + * @param[in] m - the number of subscalars to recode. + * @param[in] n - the elliptic curve group order. + * @throw ERR_NO_BUFFER - if the buffer capacity is insufficient. + */ +void bn_rec_sac(int8_t *b, size_t *len, bn_t *k, size_t m, bn_t n); + /** * Computes the coefficients of the polynomial representing the Lagrange * interpolation for a modulus and a given set of roots. diff --git a/include/relic_core.h b/include/relic_core.h index 13805aeb5..6941e014c 100644 --- a/include/relic_core.h +++ b/include/relic_core.h @@ -497,8 +497,10 @@ typedef struct _ctx_t { /** Function pointer to underlying lznct implementation. */ #if ARCH == X86 unsigned int (*lzcnt_ptr)(dig_t); + unsigned int (*tzcnt_ptr)(dig_t); #elif ARCH == X64 || ARCH == A64 unsigned int (*lzcnt_ptr)(ull_t); + unsigned int (*tzcnt_ptr)(ull_t); #endif } ctx_t; diff --git a/src/arch/relic_arch_a64.c b/src/arch/relic_arch_a64.c index f4dd6277a..1c4f1bd72 100644 --- a/src/arch/relic_arch_a64.c +++ b/src/arch/relic_arch_a64.c @@ -36,6 +36,7 @@ #include "relic_core.h" #include "lzcnt.inc" +#include "tzcnt.inc" /** * Renames the inline assembly macro to a prettier name. @@ -177,6 +178,8 @@ void arch_init(void) { if (ctx != NULL) { core_get()->lzcnt_ptr = (has_lzcnt_hard() ? lzcnt64_hard : lzcnt64_soft); + core_get()->tzcnt_ptr = + (has_tzcnt_hard() ? tzcnt64_hard : tzcnt64_soft); } #if TIMER == CYCLE @@ -199,6 +202,7 @@ void arch_clean(void) { ctx_t *ctx = core_get(); if (ctx != NULL) { core_get()->lzcnt_ptr = NULL; + core_get()->tzcnt_ptr = NULL; } } @@ -234,3 +238,7 @@ ull_t arch_cycles(void) { uint_t arch_lzcnt(dig_t x) { return core_get()->lzcnt_ptr((ull_t)x) - (8 * sizeof(ull_t) - WSIZE); } + +uint_t arch_tzcnt(dig_t x) { + return core_get()->tzcnt_ptr(x); +} diff --git a/src/arch/relic_arch_arm.c b/src/arch/relic_arch_arm.c index d6e662676..e079042b4 100644 --- a/src/arch/relic_arch_arm.c +++ b/src/arch/relic_arch_arm.c @@ -32,6 +32,7 @@ #include "relic_types.h" #include "lzcnt.inc" +#include "tzcnt.inc" /** * Renames the inline assembly macro to a prettier name. 
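A minimal sketch (not part of the patch) of how the new arch_tzcnt() hook is meant to be consumed, assuming the RELIC headers above are included; the helper name strip_power_of_two() is hypothetical:

    /* Divide a nonzero digit by its largest power-of-two factor, returning the
     * exponent through *count. This is the pattern the binary Jacobi symbol
     * code further down relies on. */
    static dig_t strip_power_of_two(dig_t x, uint_t *count) {
        *count = arch_tzcnt(x);   /* number of trailing zero bits of x. */
        return x >> *count;       /* odd part of x. */
    }
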
@@ -111,3 +112,11 @@ uint_t arch_lzcnt(uint_t x) { return lzcnt64_gcc_arm(x); #endif } + +uint_t arch_tzcnt(uint_t x) { +#if WSIZE == 32 + return tzcnt32_gcc_arm(x); +#elif WSIZE == 64 + return tzcnt64_gcc_arm(x); +#endif +} diff --git a/src/arch/relic_arch_avr.c b/src/arch/relic_arch_avr.c index 5ca50b16b..220045941 100644 --- a/src/arch/relic_arch_avr.c +++ b/src/arch/relic_arch_avr.c @@ -62,3 +62,15 @@ uint_t arch_lzcnt() { } return 0; } + +uint_t arch_tzcnt(dig_t a) { + static const uint8_t table[16] = { + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 + }; + if ((a & 0xF) != 0) { + return table[a & 0xF]; + } else { + return table[a >> 4] + 4; + } + return 0; +} diff --git a/src/arch/relic_arch_msp.c b/src/arch/relic_arch_msp.c index db2545492..314cb68d8 100644 --- a/src/arch/relic_arch_msp.c +++ b/src/arch/relic_arch_msp.c @@ -119,3 +119,32 @@ uint_t arch_lzcnt() { return 0; #endif } + +uint_t arch_tzcnt(dig_t a) { + static const uint8_t table[16] = { + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 + }; +#if WSIZE == 8 + if ((a & 0xF) != 0) { + return table[a & 0xF]; + } else { + return table[a >> 4] + 4; + } + return 0; +#elif WSIZE == 16 + int offset; + + if ((a & 0xFF) == 0) { + offset = 8; + } else { + offset = 0; + } + a = a >> offset; + if ((a & 0xF) != 0) { + return table[a & 0xF] + offset; + } else { + return table[(a >> 4) & 0xF] + 4 + offset; + } + return 0; +#endif +} diff --git a/src/arch/relic_arch_none.c b/src/arch/relic_arch_none.c index 9063cf574..98f57e744 100644 --- a/src/arch/relic_arch_none.c +++ b/src/arch/relic_arch_none.c @@ -83,7 +83,50 @@ uint_t arch_lzcnt(dig_t a) { #ifdef _MSC_VER return __lzcnt64(a); #else - return __builtin_clzll(a); + return __builtin_clzl(a); #endif #endif } + +uint_t arch_tzcnt(dig_t a) { +#if WSIZE == 8 || WSIZE == 16 + static const uint8_t table[16] = { + 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 + }; +#endif +#if WSIZE == 8 + if ((a & 0xF) != 0) { + return table[a & 0xF]; + } else { + return table[a >> 4] + 4; + } + return 0; +#elif WSIZE == 16 + int offset; + + if ((a & 0xFF) == 0) { + offset = 8; + } else { + offset = 0; + } + a = a >> offset; + if ((a & 0xF) != 0) { + return table[a & 0xF] + offset; + } else { + return table[(a >> 4) & 0xF] + 4 + offset; + } + return 0; +#elif WSIZE == 32 +#ifdef _MSC_VER + return _tzcnt_u32(a); +#else + return __builtin_ctz(a); +#endif +#elif WSIZE == 64 +#ifdef _MSC_VER + return _tzcnt_u64(a); +#else + return __builtin_ctzl(a); #endif #endif } diff --git a/src/arch/relic_arch_x64.c b/src/arch/relic_arch_x64.c index 5bef299d8..baebd138a 100644 --- a/src/arch/relic_arch_x64.c +++ b/src/arch/relic_arch_x64.c @@ -36,6 +36,7 @@ #include "relic_core.h" #include "lzcnt.inc" +#include "tzcnt.inc" /** * Renames the inline assembly macro to a prettier name. @@ -51,6 +52,8 @@ void arch_init(void) { if (ctx != NULL) { core_get()->lzcnt_ptr = (has_lzcnt_hard() ? lzcnt64_hard : lzcnt64_soft); + core_get()->tzcnt_ptr = + (has_tzcnt_hard() ? 
tzcnt64_hard : tzcnt64_soft); } } @@ -58,6 +61,7 @@ void arch_clean(void) { ctx_t *ctx = core_get(); if (ctx != NULL) { core_get()->lzcnt_ptr = NULL; + core_get()->tzcnt_ptr = NULL; } } @@ -103,3 +107,7 @@ ull_t arch_cycles(void) { uint_t arch_lzcnt(dig_t x) { return core_get()->lzcnt_ptr((ull_t)x) - (8 * sizeof(ull_t) - WSIZE); } + +uint_t arch_tzcnt(dig_t x) { + return core_get()->tzcnt_ptr(x); +} diff --git a/src/arch/relic_arch_x86.c b/src/arch/relic_arch_x86.c index fc32b5415..2f0bbac2f 100644 --- a/src/arch/relic_arch_x86.c +++ b/src/arch/relic_arch_x86.c @@ -36,6 +36,7 @@ #include "relic_core.h" #include "lzcnt.inc" +#include "tzcnt.inc" /*============================================================================*/ /* Public definitions */ @@ -43,10 +44,12 @@ void arch_init(void) { core_get()->lzcnt_ptr = (has_lzcnt_hard() ? lzcnt32_hard : lzcnt32_soft); + core_get()->tzcnt_ptr = (has_tzcnt_hard() ? tzcnt32_hard : tzcnt32_soft); } void arch_clean(void) { core_get()->lzcnt_ptr = NULL; + core_get()->tzcnt_ptr = NULL; } ull_t arch_cycles(void) { @@ -64,3 +67,7 @@ ull_t arch_cycles(void) { uint_t arch_lzcnt(dig_t x) { return core_get()->lzcnt_ptr((uint32_t)x) - (8 * sizeof(uint32_t) - WSIZE); } + +uint_t arch_tzcnt(dig_t x) { + return core_get()->tzcnt_ptr(x); +} diff --git a/src/arch/tzcnt.inc b/src/arch/tzcnt.inc new file mode 100644 index 000000000..719761e79 --- /dev/null +++ b/src/arch/tzcnt.inc @@ -0,0 +1,456 @@ +/* +Count trailing zero bits. Choice of public domain or MIT-0. + +David Reid - mackron@gmail.com + +The tzcnt32 and tzcnt64 functions count the number of trailing zero bits in a given 32- or 64-bit variable. When the input variable is 0, the +total size in bits will be returned (32 for tzcnt32 and 64 for tzcnt64). + +For x86/64 platforms, this will use the TZCNT instruction if available. On ARM it will be implemented in terms of the CLZ instruction. If these +are unavailable it will fall back to compiler-specific built-ins. If these are unavailable it'll fall back to the generic implementation. + + +License +======= + +Public Domain (www.unlicense.org) +------------------------------------------------- +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. + +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to + + +Functions +--------- +tzcnt32_generic +tzcnt64_generic + Generic implementation. + +tzcnt32_msvc_bsf +tzcnt64_msvc_bsf + MSVC built-in implementation using _BitScanForward/64(). 
Note that tzcnt64_msvc_bsf() is only available when compiling as 64-bit. + +tzcnt32_gcc_builtin +tzcnt64_gcc_builtin + GCC/Clang built-in implementation using __builtin_ctzl/ll(). Note that tzcnt64_gcc_builtin() is only available when compiling as 64-bit. + +tzcnt32_msvc_x86 +tzcnt64_msvc_x64 + MSVC implementation in terms of the __lzcnt and _tzcnt_u64 intrinsics. Note that these are only available when targeting x86/64. tzcnt64_msvc_x64() + is only available when compiling as 64-bit. + +tzcnt32_gcc_x86 +tzcnt64_gcc_x64 + GCC/Clang inline assembly implementation. This will emit the TZCNT instruction. Note that these are only available when targeting x86/x64 + and when compiled using a compiler that supports GCC style inline assembly. + +tzcnt32_gcc_arm +tzcnt64_gcc_arm + GCC/Clang inline assembly implementation. This will be implemented in terms of the CLZ instruction. Note that these are only available when + targeting ARM architecture version 5 and above and when compiled using a compiler that supports GCC style inline assembly. + +tzcnt32_hard +tzcnt64_hard + High level helper for calling a hardware implementation. This will choose either tzcnt32_msvc_x86()/tzcnt64_msvc_x64(), tzcnt32_gcc_x86()/ + tzcnt64_gcc_x64() or tzcnt32_gcc_arm()/tzcnt64_gcc_arm() depending on the environment. Note that these are only available when targeting x86/64 or ARM. tzcnt64_hard() is only available + when compiling as 64-bit. You should only call this if has_tzcnt_hard() returns non-zero. + +tzcnt32_soft +tzcnt64_soft + High level helper for calling the best software implementation available for the current build environment. + +tzcnt32 +tzcnt64 + High level helper for calling either a hardware or software implementation depending on the build environment. This will always favor a + hardware implementation. Do not call this in high performance code. The reason for this is that it will call has_tzcnt_hard() each + time, which may be too fine grained for your purposes. You may be better off calling has_tzcnt_hard() once at a higher level. + +has_tzcnt_hard + Determines whether or not a hardware implementation of tzcnt is available. Use this to know whether or not you can call tzcnt32/64_hard(). + Note that this calls CPUID on each invocation, so you may want to cache the result. Use HAS_TZCNT32/64_HARD to check for compile-time support. 
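+
+Example
+-------
+A small usage sketch (illustrative only; my_tzcnt32 and my_init are placeholder
+names, not part of this file). Probe for hardware support once and cache the
+choice, which is exactly what the relic_arch_*.c backends do with their
+function pointers:
+
+    static unsigned int (*my_tzcnt32)(unsigned int);
+
+    void my_init(void)
+    {
+    #if defined(HAS_TZCNT32_HARD)
+        my_tzcnt32 = has_tzcnt_hard() ? tzcnt32_hard : tzcnt32_soft;
+    #else
+        my_tzcnt32 = tzcnt32_soft;
+    #endif
+    }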
+*/ +#if defined(_MSC_VER) +#include <intrin.h> +#endif + +#if defined(__i386) || defined(_M_IX86) + #define ARCH_X86 +#elif defined(__x86_64__) || defined(_M_X64) + #define ARCH_X64 +#elif (defined(__arm__) && defined(__ARM_ARCH) && __ARM_ARCH >= 5) || (defined(_M_ARM) && _M_ARM >= 5) || defined(__ARM_FEATURE_CLZ) /* ARM (Architecture Version 5) */ + #define ARCH_ARM +#endif + +#if defined(_WIN64) || defined(_LP64) || defined(__LP64__) + #define ARCH_64BIT +#else + #define ARCH_32BIT +#endif + +#if defined(ARCH_X86) || defined(ARCH_X64) + /* x86/64 */ + #if defined(_MSC_VER) && _MSC_VER >= 1500 + #define HAS_TZCNT32_HARD + #if defined(ARCH_64BIT) + #define HAS_TZCNT64_HARD + #endif + #elif defined(__GNUC__) || defined(__clang__) + #define HAS_TZCNT32_HARD + #if defined(ARCH_64BIT) + #define HAS_TZCNT64_HARD + #endif + #endif +#elif defined(ARCH_ARM) + /* ARM */ + #if defined(__GNUC__) || defined(__clang__) + #define HAS_TZCNT32_HARD + #if defined(ARCH_64BIT) + #define HAS_TZCNT64_HARD + #endif + #endif +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1500 && (defined(ARCH_X86) || defined(ARCH_X64)) && !defined(__clang__) + #define HAS_TZCNT_INTRINSIC +#elif (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) + #define HAS_TZCNT_INTRINSIC +#elif defined(__clang__) + #if defined(__has_builtin) + #if __has_builtin(__builtin_ctzll) || __has_builtin(__builtin_ctzl) + #define HAS_TZCNT_INTRINSIC + #endif + #endif +#endif + +inline unsigned int tzcnt32_generic(unsigned int x) +{ + unsigned int n; + + /* Special case for odd numbers since they should happen about half the time. */ + if (x & 0x1) { + return 0; + } + + if (x == 0) { + return sizeof(x) << 3; + } + + n = 1; + if ((x & 0x0000FFFF) == 0) { x >>= 16; n += 16; } + if ((x & 0x000000FF) == 0) { x >>= 8; n += 8; } + if ((x & 0x0000000F) == 0) { x >>= 4; n += 4; } + if ((x & 0x00000003) == 0) { x >>= 2; n += 2; } + n -= x & 0x00000001; + + return n; +} + +inline unsigned int tzcnt64_generic(unsigned long long x) +{ + unsigned int n; + + /* Special case for odd numbers since they should happen about half the time. */ + if (x & 0x1) { + return 0; + } + + if (x == 0) { + return sizeof(x) << 3; + } + + n = 1; + if ((x & 0xFFFFFFFF) == 0) { x >>= 32; n += 32; } + if ((x & 0x0000FFFF) == 0) { x >>= 16; n += 16; } + if ((x & 0x000000FF) == 0) { x >>= 8; n += 8; } + if ((x & 0x0000000F) == 0) { x >>= 4; n += 4; } + if ((x & 0x00000003) == 0) { x >>= 2; n += 2; } + n -= x & 0x00000001; + + return n; +} + +/* Generic compiler specific intrinsics. */ +#if defined(_MSC_VER) +static unsigned int tzcnt32_msvc_bsf(unsigned int x) +{ + unsigned long n; + + if (x == 0) { + return 32; + } + + _BitScanForward(&n, x); + + return n; +} + +/* _BitScanForward64() is only available on 64-bit builds. 
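+ On 32-bit builds, tzcnt64_soft() falls back to the generic tzcnt64_generic() implementation instead.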
*/ +#if defined(ARCH_64BIT) +static unsigned int tzcnt64_msvc_bsf(unsigned long long x) +{ + unsigned long n; + + if (x == 0) { + return 64; + } + + _BitScanForward64(&n, x); + + return n; +} +#endif /* ARCH_64BIT */ +#elif (defined(__GNUC__) || defined(__clang__)) && defined(HAS_TZCNT_INTRINSIC) +static unsigned int tzcnt32_gcc_builtin(unsigned int x) +{ + if (x == 0) { + return 32; + } + + return (unsigned int)__builtin_ctzl((unsigned long)x); +} + +static unsigned int tzcnt64_gcc_builtin(unsigned long long x) +{ + if (x == 0) { + return 64; + } + + return (unsigned int)__builtin_ctzll(x); +} +#endif + +static int has_tzcnt_hard() +{ +#if defined(ARCH_X86) || defined(ARCH_X64) + int info[4] = {0}; + + #if defined(_MSC_VER) + __cpuid(info, 0x80000001); + #elif defined(__GNUC__) || defined(__clang__) + /* + It looks like the -fPIC option uses the ebx register which GCC complains about. We can work around this by just using a different register, the + specific register of which I'm letting the compiler decide on. The "k" prefix is used to specify a 32-bit register. The {...} syntax is for + supporting different assembly dialects. + + What's basically happening is that we're saving and restoring the ebx register manually. + */ + #if defined(ARCH_X86) && defined(__PIC__) + __asm__ __volatile__ ( + "xchg{l} {%%}ebx, %k1;" + "cpuid;" + "xchg{l} {%%}ebx, %k1;" + : "=a"(info[0]), "=&r"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(0x80000001), "c"(0) + ); + #else + __asm__ __volatile__ ( + "cpuid" : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3]) : "a"(0x80000001), "c"(0) + ); + #endif + #endif + + return (info[2] & (1 << 5)) != 0; +#elif defined(ARCH_ARM) + return 1; /* The CLZ instruction is available starting from ARM architecture version 5. Our ARCH_ARM #define is only defined when targeting version 5 at compile time. */ +#else + return 0; /* Hardware TZCNT is only supported in x86/64 and ARM for now. */ +#endif +} + +/* Intrinsics and inline-assembly. x86/64 has a hardware TZCNT instruction. You can only call these if has_tzcnt_hard() returns true. */ +#if defined(HAS_TZCNT32_HARD) + #if defined(ARCH_X86) || defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + /* Unfortunately no tzcnt instrinsic on MSVC, but we can build it in terms of lzcnt(). */ + static unsigned int tzcnt32_msvc_x86(unsigned int x) + { + if (x == 0) { + return sizeof(x) << 3; + } + + return 31 - __lzcnt(x & -(int)x); + } + #elif defined(__GNUC__) || defined(__clang__) + static unsigned int tzcnt32_gcc_x86(unsigned int x) + { + /* + att: tzcntl [out], [in] + intel: tzcnt [in], [out] + */ + unsigned int r; + __asm__ __volatile__ ( + "tzcnt{l %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" + ); + + return r; + } + #endif + #endif + #if defined(ARCH_ARM) + #if defined(__GNUC__) || defined(__clang__) + /* The ARM implementation needs to be written in terms of the CLZ instruction. This can probably be optimized by implementing the whole function in assembly. 
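+ The trailing-zero count is derived from CLZ through the identity tzcnt(x) = 31 - clz(x & -x):
+ the lowest set bit is isolated first and CLZ then gives its position. On cores that also
+ provide RBIT, a bit-reversal followed by CLZ is the usual alternative.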
*/ + static unsigned int tzcnt32_gcc_arm(unsigned int x) + { + unsigned int r; + + if (x == 0) { + return sizeof(x) << 3; + } + + __asm__ __volatile__ ( + #if defined(ARCH_32BIT) + "clz %[out], %[in]" : [out]"=r"(r) : [in]"r"(x & -x) + #else + "clz %w[out], %w[in]" : [out]"=r"(r) : [in]"r"(x & -x) + #endif + ); + + return 31 - r; + } + #endif + #endif + + static unsigned int tzcnt32_hard(unsigned int x) + { + #if defined(ARCH_X86) || defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + return tzcnt32_msvc_x86(x); + #elif defined(__GNUC__) || defined(__clang__) + return tzcnt32_gcc_x86(x); + #else + #error "This compiler does not support the tzcnt intrinsic." + #endif + #elif defined(ARCH_ARM) + #if defined(__GNUC__) || defined(__clang__) + return tzcnt32_gcc_arm(x); + #else + #error "This compiler does not support the clz intrinsic." + #endif + #else + #error "The build target does not support a native instruction." + #endif + } +#endif + +#if defined(HAS_TZCNT64_HARD) + #if defined(ARCH_X86) || defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + static unsigned int tzcnt64_msvc_x64(unsigned long long x) + { + return _tzcnt_u64(x); + } + #elif defined(__GNUC__) || defined(__clang__) + static unsigned int tzcnt64_gcc_x64(unsigned long long x) + { + /* + att: tzcnt [out], [in] + intel: tzcnt [in], [out] + */ + unsigned long long r; + __asm__ __volatile__ ( + "tzcnt{ %1, %0| %0, %1}" : "=r"(r) : "r"(x) : "cc" + ); + + return r; + } + #endif + #endif + #if defined(ARCH_ARM) + #if defined(__GNUC__) || defined(__clang__) + static unsigned int tzcnt64_gcc_arm(unsigned long long x) + { + return __builtin_ctzll(x); + } + #endif + #endif + + static unsigned int tzcnt64_hard(unsigned long long x) + { + #if defined(ARCH_X64) + #if defined(_MSC_VER) && !defined(__clang__) + return tzcnt64_msvc_x64(x); + #elif defined(__GNUC__) || defined(__clang__) + return tzcnt64_gcc_x64(x); + #else + #error "This compiler does not support the tzcnt intrinsic." + #endif + #elif defined(ARCH_ARM) && defined(ARCH_64BIT) + #if defined(__GNUC__) || defined(__clang__) + return tzcnt64_gcc_arm(x); + #else + #error "This compiler does not support the clz intrinsic." + #endif + #else + #error "The build target does not support a native instruction." 
+ #endif + } +#endif + + +static unsigned int tzcnt32_soft(unsigned int x) +{ +#if defined(_MSC_VER) + return tzcnt32_msvc_bsf(x); +#elif defined(HAS_TZCNT_INTRINSIC) + return tzcnt32_gcc_builtin(x); +#else + return tzcnt32_generic(x); +#endif +} + +static unsigned int tzcnt64_soft(unsigned long long x) +{ +#if defined(ARCH_64BIT) + #if defined(_MSC_VER) + return tzcnt64_msvc_bsf(x); + #elif defined(HAS_TZCNT_INTRINSIC) + return tzcnt64_gcc_builtin(x); + #else + return tzcnt64_generic(x); + #endif +#else + return tzcnt64_generic(x); +#endif +} + + +inline static unsigned int tzcnt32(unsigned int x) +{ +#if defined(HAS_TZCNT32_HARD) + if (has_tzcnt_hard()) { + return tzcnt32_hard(x); + } else +#endif + { + return tzcnt32_soft(x); + } +} + +inline static unsigned int tzcnt64(unsigned int x) +{ +#if defined(HAS_TZCNT64_HARD) + if (has_tzcnt_hard()) { + return tzcnt64_hard(x); + } else +#endif + { + return tzcnt64_soft(x); + } +} diff --git a/src/bn/relic_bn_rec.c b/src/bn/relic_bn_rec.c index d48b63660..2f8b2c210 100644 --- a/src/bn/relic_bn_rec.c +++ b/src/bn/relic_bn_rec.c @@ -876,16 +876,24 @@ void bn_rec_glv(bn_t k0, bn_t k1, const bn_t k, const bn_t n, const bn_t *v1, } } -void bn_rec_sac(bn_t *b, bn_t *k, size_t m, bn_t n) { +void bn_rec_sac(int8_t *b, size_t *len, bn_t *k, size_t m, bn_t n) { /* Assume k0 is the sign-aligner. */ bn_t *t = RLC_ALLOCA(bn_t, m); size_t l = RLC_CEIL(bn_bits(n), m) + 1; + int8_t bji; if (t == NULL) { RLC_THROW(ERR_NO_MEMORY); return; } + if (*len <= l) { + *len = 0; + RLC_FREE(t); + RLC_THROW(ERR_NO_BUFFER); + return; + } + RLC_TRY { for (size_t i = 0; i < m; i++) { bn_null(t[i]); @@ -893,25 +901,32 @@ void bn_rec_sac(bn_t *b, bn_t *k, size_t m, bn_t n) { bn_copy(t[i], k[i]); } - bn_set_bit(b[0], l - 1, 0); + /* The current basis for BN curves might be one bit longer. */ + for (size_t i = 0; i < m; i++) { + l = RLC_MAX(l, bn_bits(k[i]) + 1); + } + + b[l - 1] = 0; for (size_t i = 0; i < l - 1; i++) { - bn_set_bit(b[0], i, 1 - bn_get_bit(k[0], i + 1)); + b[i] = 1 - bn_get_bit(k[0], i + 1); } for (size_t j = 1; j < m; j++) { - for (size_t i = 0; i < l; i++) { - uint8_t bji = bn_get_bit(t[j], 0); - bn_set_bit(b[j], i, bji); + for (size_t i = 0; i < l - 1; i++) { + bji = bn_get_bit(t[j], 0); + b[j * l + i] = bji; bn_hlv(t[j], t[j]); - bn_add_dig(t[j], t[j], bji & bn_get_bit(b[0], i)); + bn_add_dig(t[j], t[j], bji & b[i]); } + b[j * l + l - 1] = bn_get_bit(t[j], 0); } + *len = l; } RLC_CATCH_ANY { RLC_THROW(ERR_CAUGHT); } RLC_FINALLY { for (size_t i = 0; i < m; i++) { bn_free(t[i]); - RLC_FREE(t); } + RLC_FREE(t); } } diff --git a/src/bn/relic_bn_smb.c b/src/bn/relic_bn_smb.c index 98f74dd82..d1446ae0e 100644 --- a/src/bn/relic_bn_smb.c +++ b/src/bn/relic_bn_smb.c @@ -77,14 +77,21 @@ int bn_smb_leg(const bn_t a, const bn_t b) { } int bn_smb_jac(const bn_t a, const bn_t b) { - bn_t t0, t1, r; - int t, h, res; + dis_t ai, bi, ci, di; + dig_t n, d, t; + bn_t t0, t1, t2, t3; + uint_t z, i, s = (RLC_DIG >> 1) - 2; + int r; bn_null(t0); bn_null(t1); - bn_null(r); + bn_null(t2); + bn_null(t3); - /* Argument b must be odd. */ + /* Optimized Pornin's Algorithm by Aleksei Vambol from + * https://github.com/privacy-scaling-explorations/halo2curves/pull/95 */ + + /* Argument b must be odd for Jacobi symbol. 
*/ if (bn_is_even(b) || bn_sign(b) == RLC_NEG) { RLC_THROW(ERR_NO_VALID); return 0; @@ -93,55 +100,119 @@ int bn_smb_jac(const bn_t a, const bn_t b) { RLC_TRY { bn_new(t0); bn_new(t1); - bn_new(r); - t = 1; + bn_new(t2); + bn_new(t3); - if (bn_sign(a) == RLC_NEG) { - bn_add(t0, a, b); - } else { - bn_copy(t0, a); - } + bn_mod(t0, a, b); bn_copy(t1, b); + t = 0; while (1) { - /* t0 = a mod b. */ - bn_mod(t0, t0, t1); - /* If a = 0 then if n = 1 return t else return 0. */ - if (bn_is_zero(t0)) { - if (bn_cmp_dig(t1, 1) == RLC_EQ) { - res = 1; - if (t == -1) { - res = -1; + ai = di = 1; + bi = ci = 0; + + i = RLC_MAX(t0->used, t1->used); + dv_zero(t0->dp + t0->used, i - t0->used); + dv_zero(t1->dp + t1->used, i - t1->used); + if (i == 1) { + n = t0->dp[0]; + d = t1->dp[0]; + while (n != 0) { + if (n & 1) { + if (n < d) { + RLC_SWAP(n, d); + t ^= (n & d); + } + n = (n - d) >> 1; + t ^= d ^ (d >> 1); + } else { + z = arch_tzcnt(n); + t ^= (d ^ (d >> 1)) & (z << 1); + n >>= z; + } + } + r = (d == 1 ? 1 - (t & 2) : 0); + break; + } + + z = RLC_MIN(arch_lzcnt(t0->dp[i - 1]), arch_lzcnt(t1->dp[i - 1])); + n = t0->dp[i - 1] << z; + d = t1->dp[i - 1] << z; + if (z > (RLC_DIG >> 1)) { + n |= t0->dp[i - 2] >> z; + d |= t1->dp[i - 2] >> z; + } + n = (n & RLC_HMASK) | (t0->dp[0] & RLC_LMASK); + d = (d & RLC_HMASK) | (t1->dp[0] & RLC_LMASK); + + i = s; + while (i > 0) { + if (n & 1) { + if (n < d) { + RLC_SWAP(ai, ci); + RLC_SWAP(bi, di); + RLC_SWAP(n, d); + t ^= (n & d); } - break; + n = (n - d) >> 1; + ai = ai - ci; + bi = bi - di; + ci += ci; + di += di; + t ^= d ^ (d >> 1); + i -= 1; } else { - res = 0; - break; + z = RLC_MIN(i, arch_tzcnt(n)); + t ^= (d ^ (d >> 1)) & (z << 1); + ci = (dig_t)ci << z; + di = (dig_t)di << z; + n >>= z; + i -= z; } } - /* Write t0 as 2^h * t0. */ - h = 0; - while (bn_is_even(t0) && !bn_is_zero(t0)) { - h++; - bn_rsh(t0, t0, 1); + + if (ai < 0) { + bn_mul_dig(t2, t0, -ai); + bn_neg(t2, t2); + } else { + bn_mul_dig(t2, t0, ai); } - /* If h != 0 (mod 2) and n != +-1 (mod 8) then t = -t. */ - bn_mod_2b(r, t1, 3); - if ((h % 2 != 0) && (bn_cmp_dig(r, 1) != RLC_EQ) && - (bn_cmp_dig(r, 7) != RLC_EQ)) { - t = -t; + if (bi < 0) { + bn_mul_dig(t3, t1, -bi); + bn_neg(t3, t3); + } else { + bn_mul_dig(t3, t1, bi); } - /* If t0 != 1 (mod 4) and n != 1 (mod 4) then t = -t. */ - bn_mod_2b(r, t0, 2); - if (bn_cmp_dig(r, 1) != RLC_EQ) { - bn_mod_2b(r, t1, 2); - if (bn_cmp_dig(r, 1) != RLC_EQ) { - t = -t; - } + bn_add(t3, t3, t2); + + if (ci < 0) { + bn_mul_dig(t2, t0, -ci); + bn_neg(t2, t2); + } else { + bn_mul_dig(t2, t0, ci); + } + if (di < 0) { + bn_mul_dig(t1, t1, -di); + bn_neg(t1, t1); + } else { + bn_mul_dig(t1, t1, di); + } + bn_add(t1, t1, t2); + bn_rsh(t1, t1, s); + bn_rsh(t0, t3, s); + + if (bn_is_zero(t0)) { + r = (bn_cmp_dig(t1, 1) == RLC_EQ ? 
1 - (t & 2) : 0); + break; + } + + if (bn_sign(t0) == RLC_NEG) { + t ^= t1->dp[0]; + bn_neg(t0, t0); + } + if (bn_sign(t1) == RLC_NEG) { + bn_neg(t1, t1); } - bn_copy(r, t0); - bn_copy(t0, t1); - bn_copy(t1, r); } } RLC_CATCH_ANY { @@ -150,8 +221,9 @@ int bn_smb_jac(const bn_t a, const bn_t b) { RLC_FINALLY { bn_free(t0); bn_free(t1); - bn_free(r); + bn_free(t2); + bn_free(t3); } - return res; + return r; } diff --git a/src/ep/relic_ep_map.c b/src/ep/relic_ep_map.c index fb923a61e..c96b78795 100644 --- a/src/ep/relic_ep_map.c +++ b/src/ep/relic_ep_map.c @@ -258,6 +258,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { bn_null(k); fp_null(v); fp_null(w); + fp_null(y); fp_null(t1); fp_null(t2); fp_null(x1); @@ -271,6 +272,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { bn_new(k); fp_new(v); fp_new(w); + fp_new(y); fp_new(t1); fp_new(t2); fp_new(x1); @@ -478,6 +480,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { bn_free(k); fp_free(v); fp_free(w); + fp_free(y); fp_free(t1); fp_free(t2); fp_free(x1); @@ -488,8 +491,7 @@ void ep_map_swift(ep_t p, const uint8_t *msg, size_t len) { fp_free(d[2]); RLC_FREE(pseudo_random_bytes); for (size_t i = 0; i < 8; i++) { - fp_null(h[i]); - fp_new(h[i]); + fp_free(h[i]); } } } diff --git a/src/epx/relic_ep2_map.c b/src/epx/relic_ep2_map.c index a99998f8b..447050d53 100644 --- a/src/epx/relic_ep2_map.c +++ b/src/epx/relic_ep2_map.c @@ -516,6 +516,12 @@ void ep2_map_swift(ep2_t p, const uint8_t *msg, size_t len) { } RLC_FINALLY { bn_free(k); + fp2_free(a); + fp2_free(b); + fp2_free(c); + fp2_free(d); + fp2_free(e); + fp2_free(f); fp2_free(t); fp2_free(u); fp2_free(v); diff --git a/src/epx/relic_ep2_mul.c b/src/epx/relic_ep2_mul.c index 68904d8f5..2e2386cbf 100644 --- a/src/epx/relic_ep2_mul.c +++ b/src/epx/relic_ep2_mul.c @@ -115,100 +115,102 @@ static void ep2_mul_gls_imp(ep2_t r, const ep2_t p, const bn_t k) { #if EP_MUL == LWREG || !defined(STRIP) static void ep2_mul_reg_gls(ep2_t r, const ep2_t p, const bn_t k) { - int8_t reg[4][RLC_FP_BITS + 1], b[4], s[4], c0, n0; - ep2_t q, w, t[4][1 << (RLC_WIDTH - 2)]; + size_t l; bn_t n, _k[4], u; - size_t l, len, _l[4]; + int8_t even, col, sac[4 * (RLC_FP_BITS + 1)]; + ep2_t q[4], t[1 << 3]; bn_null(n); bn_null(u); - ep2_null(q); - ep2_null(w); RLC_TRY { bn_new(n); bn_new(u); - ep2_new(q); - ep2_new(w); - for (size_t i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { bn_null(_k[i]); + ep2_null(q[i]); bn_new(_k[i]); - for (size_t j = 0; j < (1 << (RLC_WIDTH - 2)); j++) { - ep2_null(t[i][j]); - ep2_new(t[i][j]); - } + ep2_new(q[i]); + } + for (int i = 0; i < (1 << 3); i++) { + ep2_null(t[i]); + ep2_new(t[i]); } ep2_curve_get_ord(n); fp_prime_get_par(u); bn_mod(_k[0], k, n); bn_rec_frb(_k, 4, _k[0], u, n, ep_curve_is_pairf() == EP_BN); - - l = 0; - /* Make some extra room for BN curves that grow subscalars by 1. */ - len = bn_bits(u) + (ep_curve_is_pairf() == EP_BN); - ep2_norm(t[0][0], p); + ep2_norm(q[0], p); + ep2_frb(q[1], q[0], 1); + ep2_frb(q[2], q[1], 1); + ep2_frb(q[3], q[2], 1); for (size_t i = 0; i < 4; i++) { - s[i] = bn_sign(_k[i]); - bn_abs(_k[i], _k[i]); - b[i] = bn_is_even(_k[i]); - _k[i]->dp[0] |= b[i]; - - _l[i] = RLC_FP_BITS + 1; - bn_rec_reg(reg[i], &_l[i], _k[i], len, RLC_WIDTH); - l = RLC_MAX(l, _l[i]); - - /* Apply Frobenius before flipping sign to build table. 
*/ - if (i > 0) { - ep2_frb(t[i][0], t[i - 1][0], 1); - } + ep2_neg(r, q[i]); + fp2_copy_sec(q[i]->y, r->y, bn_sign(_k[i]) == RLC_NEG); + _k[i]->sign = RLC_POS; } + even = bn_is_even(_k[0]); + bn_add_dig(_k[0], _k[0], even); - for (size_t i = 0; i < 4; i++) { - ep2_neg(q, t[i][0]); - fp2_copy_sec(q->y, t[i][0]->y, s[i] == RLC_POS); - ep2_tab(t[i], q, RLC_WIDTH); + ep2_copy(t[0], q[0]); + for (size_t i = 1; i < (1 << 3); i++) { + l = util_bits_dig(i); + ep2_add(t[i], t[i ^ (1 << (l - 1))], q[l]); } + l = RLC_FP_BITS + 1; + bn_rec_sac(sac, &l, _k, 4, n); + #if defined(EP_MIXED) - fp2_set_dig(w->z, 1); - w->coord = BASIC; + ep2_norm_sim(t + 1, t + 1, (1 << 3) - 1); + fp2_set_dig(r->z, 1); + fp2_set_dig(q[1]->z, 1); + r->coord = q[1]->coord = BASIC; #else - w->coord = = EP_ADD; + r->coord = q[1]->coord = EP_ADD; #endif - ep2_set_infty(r); - for (int j = l - 1; j >= 0; j--) { - for (size_t i = 0; i < RLC_WIDTH - 1; i++) { - ep2_dbl(r, r); - } + col = 0; + for (int i = 3; i > 0; i--) { + col <<= 1; + col += sac[i * l + l - 1]; + } + for (size_t m = 0; m < (1 << 3); m++) { + fp2_copy_sec(r->x, t[m]->x, m == col); + fp2_copy_sec(r->y, t[m]->y, m == col); +#if !defined(EP_MIXED) + fp2_copy_sec(r->z, t[m]->z, m == col); +#endif + } - for (size_t i = 0; i < 4; i++) { - n0 = reg[i][j]; - c0 = (n0 >> 7); - n0 = ((n0 ^ c0) - c0) >> 1; - - for (size_t m = 0; m < (1 << (RLC_WIDTH - 2)); m++) { - fp2_copy_sec(w->x, t[i][m]->x, m == n0); - fp2_copy_sec(w->y, t[i][m]->y, m == n0); - #if !defined(EP_MIXED) - fp2_copy_sec(w->z, t[i][m]->z, m == n0); - #endif - } + ep2_neg(q[1], r); + fp2_copy_sec(r->y, q[1]->y, sac[l - 1] != 0); + for (int j = l - 2; j >= 0; j--) { + ep2_dbl(r, r); - ep2_neg(q, w); - fp2_copy_sec(q->y, w->y, c0 == 0); - ep2_add(r, r, q); + col = 0; + for (int i = 3; i > 0; i--) { + col <<= 1; + col += sac[i * l + j]; } + + for (size_t m = 0; m < (1 << 3); m++) { + fp2_copy_sec(q[1]->x, t[m]->x, m == col); + fp2_copy_sec(q[1]->y, t[m]->y, m == col); +#if !defined(EP_MIXED) + fp2_copy_sec(q[1]->z, t[m]->z, m == col); +#endif + } + ep2_neg(q[2], q[1]); + fp2_copy_sec(q[1]->y, q[2]->y, sac[j]); + ep2_add(r, r, q[1]); } - for (size_t i = 0; i < 4; i++) { - /* Tables are built with points already negated, so no need here. */ - ep2_sub(q, r, t[i][0]); - fp2_copy_sec(r->x, q->x, b[i]); - fp2_copy_sec(r->y, q->y, b[i]); - fp2_copy_sec(r->z, q->z, b[i]); - } + ep2_sub(q[1], r, q[0]); + fp2_copy_sec(r->x, q[1]->x, even); + fp2_copy_sec(r->y, q[1]->y, even); + fp2_copy_sec(r->z, q[1]->z, even); /* Convert r to affine coordinates. */ ep2_norm(r, r); @@ -219,13 +221,12 @@ static void ep2_mul_reg_gls(ep2_t r, const ep2_t p, const bn_t k) { RLC_FINALLY { bn_free(n); bn_free(u); - ep2_free(q); - ep2_free(w); for (int i = 0; i < 4; i++) { bn_free(_k[i]); - for (size_t j = 0; j < (1 << (RLC_WIDTH - 2)); j++) { - ep2_free(t[i][j]); - } + ep2_free(q[i]); + } + for (int i = 0; i < (1 << 3); i++) { + ep2_free(t[i]); } } } diff --git a/src/fp/relic_fp_smb.c b/src/fp/relic_fp_smb.c index 0ff9d5266..f9f2602bb 100644 --- a/src/fp/relic_fp_smb.c +++ b/src/fp/relic_fp_smb.c @@ -164,24 +164,24 @@ static dig_t porninstep(dis_t m[4],const dig_t f[2], const dig_t g[2], static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, int s) { - dig_t d0, t0, t1, t2, c0, c1, yi, ai = 1, bi = 0, ci = 0, di = 1, u = 0; + dig_t t0, t1, t2, c0, c1, yi, ai = 1, bi = 0, ci = 0, di = 1, u = 0; /* Unrolling twice makes it faster. 
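 The sign of delta is expanded into an all-ones mask (delta >> (RLC_DIG - 1)) and combined with
 the parity of x, so the conditional negations and additions below stay branchless and run in
 constant time.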
*/ for (s -= 2; s >= 0; s -= 2) { yi = y; - d0 = (delta >= 0); + c0 = delta >> (RLC_DIG - 1); c1 = -(x & 1); - c0 = (-d0) & c1; + c0 &= c1; - t0 = (y ^ -d0) + d0; - t1 = (ci ^ -d0) + d0; - t2 = (di ^ -d0) + d0; + t0 = (y ^ c0) - c0; + t1 = (ci ^ c0) - c0; + t2 = (di ^ c0) - c0; x += t0 & c1; ai += t1 & c1; bi += t2 & c1; - /* delta = RLC_SEL(delta + 1, -delta, c0) */ + /* delta = RLC_SEL(2 + delta, 2 - delta, c0) */ y += x & c0; ci += ai & c0; di += bi & c0; @@ -189,25 +189,25 @@ static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, x >>= 1; ci <<= 1; di <<= 1; - delta = (delta ^ c0) + 1; + delta = (delta ^ c0) - 1; u += ((yi & y) ^ (y >> 1)) & 2; u += (u & 1) ^ RLC_SIGN(ci); yi = y; - d0 = (delta >= 0); + c0 = delta >> (RLC_DIG - 1); c1 = -(x & 1); - c0 = (-d0) & c1; + c0 &= c1; - t0 = (y ^ -d0) + d0; - t1 = (ci ^ -d0) + d0; - t2 = (di ^ -d0) + d0; + t0 = (y ^ c0) - c0; + t1 = (ci ^ c0) - c0; + t2 = (di ^ c0) - c0; x += t0 & c1; ai += t1 & c1; bi += t2 & c1; - /* delta = RLC_SEL(delta + 1, -delta, c0) */ + /* delta = RLC_SEL(2 + delta, 2 - delta, c0) */ y += x & c0; ci += ai & c0; di += bi & c0; @@ -215,7 +215,7 @@ static dis_t jumpdivstep(dis_t m[4], dig_t *k, dis_t delta, dis_t y, dis_t x, x >>= 1; ci <<= 1; di <<= 1; - delta = (delta ^ c0) + 1; + delta = (delta ^ c0) - 1; u += ((yi & y) ^ (y >> 1)) & 2; u += (u & 1) ^ RLC_SIGN(ci); @@ -468,7 +468,7 @@ int fp_smb_divst(const fp_t a) { #if FP_SMB == JMPDS || !defined(STRIP) int fp_smb_jmpds(const fp_t a) { - dis_t m[4], d = 0; + dis_t m[4], d = -1; /* Iterations taken directly from https://github.com/sipa/safegcd-bounds */ const int iterations = (45907 * FP_PRIME + 26313) / 19929; int loops, precision, i, r = 0, s = RLC_DIG - 2; diff --git a/test/test_bn.c b/test/test_bn.c index 4e2f7f574..5ac90f83b 100644 --- a/test/test_bn.c +++ b/test/test_bn.c @@ -2281,6 +2281,29 @@ static int recoding(void) { bn_cmp(a, v2[2]) == RLC_EQ, end); } } TEST_END; + + TEST_CASE("glv-sac recoding is correct") { + size_t l = RLC_BN_BITS; + int8_t ptr[2 * RLC_BN_BITS] = { 0 }; + if (ep_param_set_any_endom() == RLC_OK) { + ep_curve_get_v1(v1); + ep_curve_get_v2(v2); + ep_curve_get_ord(b); + bn_rand_mod(a, b); + bn_rec_glv(b, c, a, b, (const bn_t *)v1, (const bn_t *)v2); + ep_curve_get_ord(v2[0]); + bn_rec_sac(ptr, &l, v1, 2, v2[0]); + if (bn_is_even(b)) { + bn_add_dig(b, b, 1); + } + bn_copy(v1[0], b); + bn_copy(v1[1], c); + for (size_t i = 0; i < l; i++) { + TEST_ASSERT(ptr[i] == 0 || ptr[i] == 1, end); + TEST_ASSERT(ptr[l + i] == 0 || ptr[l + i] == 1, end); + } + } + } TEST_END; #endif /* WITH_EP && EP_ENDOM */ } RLC_CATCH_ANY {