WIP: "safegcd" field and scalar inversion #767

Closed
wants to merge 34 commits
Changes from 4 commits

Commits (34):
9fbe485  "safegcd" field and scalar inversion (peterdettman, Jul 15, 2020)
4fab082  Fix secp256k1_scalar_is_even/scalar_low issue (peterdettman, Jul 15, 2020)
0b90a57  TODOs and comments (peterdettman, Jul 16, 2020)
0c3869a  VERIFY_CHECK _divsteps_62 loop invariant (peterdettman, Jul 18, 2020)
11b525c  More checks and comments (peterdettman, Jul 21, 2020)
3ae7179  Update f,g at full length until proper analysis (peterdettman, Jul 21, 2020)
2f643ad  Initial 32bit safegcd (peterdettman, Jul 21, 2020)
b29e51e  Minor cleanup (peterdettman, Jul 27, 2020)
3519dcc  Initial _inv_var implementations (peterdettman, Jul 27, 2020)
bd18471  Simplify type of 'eta' (peterdettman, Jul 31, 2020)
b8c7390  field_5x52: update Bézout coefficients on-the-fly (peterdettman, Aug 8, 2020)
64a4912  field_10x26: update Bézout coefficients on-the-fly (peterdettman, Aug 8, 2020)
e5f2d29  scalar_4x64: update Bézout coefficients on-the-fly (peterdettman, Aug 8, 2020)
34bec40  scalar_8x32: update Bézout coefficients on-the-fly (peterdettman, Aug 8, 2020)
bfd7a0f  Alternate var-time divsteps code (peterdettman, Aug 9, 2020)
f873c3b  Add comments regarding small inputs (peterdettman, Aug 9, 2020)
17982d8  Avoid left shift of signed values (peterdettman, Aug 9, 2020)
06d568a  Add alternative to __builtin_ctz intrinsics (peterdettman, Aug 9, 2020)
16509ca  Write primes in signed-digit form (peterdettman, Aug 9, 2020)
40c815e  Unify _update_de_ methods (peterdettman, Aug 9, 2020)
dc58f4f  Redo update_de methods (peterdettman, Aug 10, 2020)
132c76d  Faster 64bit _inv_var, why not? (peterdettman, Aug 11, 2020)
2f6dfa2  Get better control over the range of d, e (peterdettman, Aug 12, 2020)
90743d2  Verify the expected zeros are produced (peterdettman, Aug 13, 2020)
5de2c83  _inv_var conditional negations (peterdettman, Aug 13, 2020)
308fd32  Experiment with f,g shortening in inv_var (peterdettman, Aug 15, 2020)
ff0cf11  f,g shortening for 64bit field (peterdettman, Aug 15, 2020)
b51a1b5  THIS_IS_FASTER (peterdettman, Aug 16, 2020)
1baff2c  Accentuate the positive (peterdettman, Aug 16, 2020)
65550c1  Try 128 byte table of inverses (peterdettman, Aug 17, 2020)
5ccfc30  Avoid redundant calculation (peterdettman, Aug 25, 2020)
cbd2d57  Faster const-time divsteps (peterdettman, Sep 9, 2020)
85da7a9  Rework _update_de I/O bounds (peterdettman, Nov 10, 2020)
c9b7717  Rework _update_de for 32bit (peterdettman, Nov 11, 2020)
src/field_10x26.h: 2 changes (2 additions, 0 deletions)

@@ -47,4 +47,6 @@ typedef struct {
#define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}
#define SECP256K1_FE_STORAGE_CONST_GET(d) d.n[7], d.n[6], d.n[5], d.n[4],d.n[3], d.n[2], d.n[1], d.n[0]

#define SECP256K1_FE_INV_DEFAULT

#endif /* SECP256K1_FIELD_REPR_H */
src/field_5x52_impl.h: 376 changes (376 additions, 0 deletions)

@@ -498,4 +498,380 @@ static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) {
#endif
}

/* 2^744 (mod p), as a field element. 744 = 12 * 62 is the total number of
 * divsteps performed in secp256k1_fe_inv below; pre-multiplying the input
 * by this constant cancels the 2^-744 scaling those divsteps introduce. */
static const secp256k1_fe SECP256K1_FE_TWO_POW_744 = SECP256K1_FE_CONST(
0x0E90A100UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
0x00000000UL, 0x00000100UL, 0x000B7300UL, 0x1D214200UL
);

static void secp256k1_fe_mul_add(int64_t a0, int64_t a1, int64_t b0, int64_t b1, int64_t c0, int64_t c1, int64_t d0, int64_t d1, int64_t *t) {

/* Each [a0,a1], etc. pair is a 126-bit signed value, e.g. a0 + a1 * 2^64.
 * This method calculates ([a0,a1] * [b0,b1]) + ([c0,c1] * [d0,d1]), and
 * writes the 252-bit signed result to [t[0],t[1],t[2],t[3]].
 */

int64_t z0, z1, z2, z3;
int128_t tt;

tt = (int128_t)a0 * b0
+ (int128_t)c0 * d0;
z0 = (int64_t)tt; tt -= z0; tt >>= 64;

tt += (int128_t)a0 * b1
+ (int128_t)a1 * b0
+ (int128_t)c0 * d1
+ (int128_t)c1 * d0;
z1 = (int64_t)tt; tt -= z1; tt >>= 64;

tt += (int128_t)a1 * b1
+ (int128_t)c1 * d1;
z2 = (int64_t)tt; tt -= z2; tt >>= 64;

z3 = (int64_t)tt;

t[0] = z0; t[1] = z1; t[2] = z2; t[3] = z3;
}
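
As a sanity-check sketch of the two-limb convention (hypothetical code, not part of the PR; the helper name is invented): a value that fits in a single limb is encoded as [v, 0], and any sum of products of such values lands entirely in the lowest output limb.

/* Hypothetical check, not from the PR: for single-limb inputs the
 * 252-bit result collapses into t[0], with t[1..3] zero. */
static void secp256k1_fe_mul_add_check_small(int32_t a, int32_t b, int32_t c, int32_t d) {
    int64_t t[4];
    int64_t want = (int64_t)a * b + (int64_t)c * d; /* fits in 63 bits */
    secp256k1_fe_mul_add(a, 0, b, 0, c, 0, d, 0, t);
    VERIFY_CHECK(t[0] == want && t[1] == 0 && t[2] == 0 && t[3] == 0);
}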

/* Multiply two 2x2 transition matrices: with M1 = [[a,b],[c,d]] in t[0..3]
 * and M2 = [[e,f],[g,h]] in t[4..7], write M2 * M1 back to t, each entry
 * now a two-limb [low,high] value as in _fe_mul_add. */
static void secp256k1_fe_combine_1s(int64_t *t) {

int64_t a = t[0], b = t[1], c = t[2], d = t[3],
e = t[4], f = t[5], g = t[6], h = t[7];
int128_t I, J, K, L;

I = (int128_t)e * a + (int128_t)f * c;
J = (int128_t)e * b + (int128_t)f * d;
K = (int128_t)g * a + (int128_t)h * c;
L = (int128_t)g * b + (int128_t)h * d;

a = (int64_t)I; I -= a; I >>= 64; b = (int64_t)I;
c = (int64_t)J; J -= c; J >>= 64; d = (int64_t)J;
e = (int64_t)K; K -= e; K >>= 64; f = (int64_t)K;
g = (int64_t)L; L -= g; L >>= 64; h = (int64_t)L;

t[0] = a; t[1] = b; t[2] = c; t[3] = d;
t[4] = e; t[5] = f; t[6] = g; t[7] = h;
}

/* As _fe_combine_1s, but for 2x2 matrices whose entries are 126-bit
 * two-limb values; the product entries are written back as four limbs each. */
static void secp256k1_fe_combine_2s(int64_t *t) {

int64_t a0 = t[ 0], a1 = t[ 1];
int64_t b0 = t[ 2], b1 = t[ 3];
int64_t c0 = t[ 4], c1 = t[ 5];
int64_t d0 = t[ 6], d1 = t[ 7];
int64_t e0 = t[ 8], e1 = t[ 9];
int64_t f0 = t[10], f1 = t[11];
int64_t g0 = t[12], g1 = t[13];
int64_t h0 = t[14], h1 = t[15];

secp256k1_fe_mul_add(e0, e1, a0, a1, f0, f1, c0, c1, &t[0]);
secp256k1_fe_mul_add(e0, e1, b0, b1, f0, f1, d0, d1, &t[4]);
secp256k1_fe_mul_add(g0, g1, a0, a1, h0, h1, c0, c1, &t[8]);
secp256k1_fe_mul_add(g0, g1, b0, b1, h0, h1, d0, d1, &t[12]);
}

/* Interpret t[0..3] as a signed 256-bit value (in the limb convention of
 * _fe_mul_add) and convert it to field representation, with magnitude 2. */
static void secp256k1_fe_decode_matrix(secp256k1_fe *r, int64_t *t) {

uint64_t u0, u1, u2, u3, u4;
uint64_t r0, r1, r2, r3, r4;
int128_t cc;

cc = t[0];
u0 = (uint64_t)cc; cc >>= 64;
cc += t[1];
u1 = (uint64_t)cc; cc >>= 64;
cc += t[2];
u2 = (uint64_t)cc; cc >>= 64;
cc += t[3];
u3 = (uint64_t)cc; cc >>= 64;
u4 = (uint64_t)cc;

VERIFY_CHECK(u4 == 0 || u4 == UINT64_MAX);

/* Add twice the field prime in case u4 is non-zero (which represents -2^256). */
r0 = 0xFFFFEFFFFFC2FULL * 2;
r1 = 0xFFFFFFFFFFFFFULL * 2;
r2 = 0xFFFFFFFFFFFFFULL * 2;
r3 = 0xFFFFFFFFFFFFFULL * 2;
r4 = 0x0FFFFFFFFFFFFULL * 2;

r0 += u0 & 0xFFFFFFFFFFFFFULL;
r1 += u0 >> 52 | ((u1 << 12) & 0xFFFFFFFFFFFFFULL);
r2 += u1 >> 40 | ((u2 << 24) & 0xFFFFFFFFFFFFFULL);
r3 += u2 >> 28 | ((u3 << 36) & 0xFFFFFFFFFFFFFULL);
r4 += u3 >> 16 | (u4 << 48);

r->n[0] = r0;
r->n[1] = r1;
r->n[2] = r2;
r->n[3] = r3;
r->n[4] = r4;

#ifdef VERIFY
r->magnitude = 2;
r->normalized = 0;
secp256k1_fe_verify(r);
#endif
}

/* Pack a normalized field element into five signed-62-bit limbs. */
static void secp256k1_fe_encode_62(int64_t *r, const secp256k1_fe *a) {

const uint64_t M62 = UINT64_MAX >> 2;
const uint64_t *n = &a->n[0];
uint64_t a0 = n[0], a1 = n[1], a2 = n[2], a3 = n[3], a4 = n[4];

#ifdef VERIFY
VERIFY_CHECK(a->normalized);
#endif

r[0] = (a0 | a1 << 52) & M62;
r[1] = (a1 >> 10 | a2 << 42) & M62;
r[2] = (a2 >> 20 | a3 << 32) & M62;
r[3] = (a3 >> 30 | a4 << 22) & M62;
r[4] = a4 >> 40;
}
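
For orientation, a hedged sketch of the inverse unpacking (hypothetical helper, not in the PR; _fe_decode_matrix above plays this role for the four-limb form): five signed-62-bit limbs in canonical range map back onto the 5x52 representation by reversing the shifts above.

/* Hypothetical inverse of _fe_encode_62, not from the PR; assumes all
 * limbs are non-negative and in canonical range. */
static void secp256k1_fe_decode_62_sketch(secp256k1_fe *r, const int64_t *a) {
    const uint64_t M52 = UINT64_MAX >> 12;
    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
    r->n[0] = a0 & M52;
    r->n[1] = (a0 >> 52 | a1 << 10) & M52;
    r->n[2] = (a1 >> 42 | a2 << 20) & M52;
    r->n[3] = (a2 >> 32 | a3 << 30) & M52;
    r->n[4] = (a3 >> 22 | a4 << 40);
}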

/* Perform 62 divsteps on (f0, g0), starting from the given eta (related to
 * the 'delta' of the safegcd paper). Writes the transition matrix [u,v; q,r]
 * to t[0..3], satisfying u*f0 + v*g0 = -f*2^62 and q*f0 + r*g0 = -g*2^62 for
 * the final (f, g), and returns the updated eta. */
static int secp256k1_fe_divsteps_62(uint16_t eta, uint64_t f0, uint64_t g0, int64_t *t) {
Contributor:

Just one thing I noticed when skimming through it: the types of eta are pretty inconsistent. You use uint16_t inside the function, return an int, and then store it in an int16_t.

Contributor Author (peterdettman):

Sure, it should be cleaned up. It's logically a signed value (~ in [-744,744]) in the sense that the algorithm as written switches on its sign, but the actual constant-time code for _divsteps is in "bit-twiddling" style, and I wanted to make it clear that there's no reliance (in this method) on arithmetic right shift. Conceptually maybe it's (u)int_fast16_t, but I was a bit lazy about dealing with the variable sizes before things worked. Possibly uint64_t (resp. uint32_t for 32bit) makes more sense.
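
To make the "bit-twiddling" point concrete, here is a minimal sketch (hypothetical code, not from the PR): with a mask that is either all zeros or all ones, the XOR-subtract pattern applied to eta in the loop below is a branchless conditional negation followed by a decrement, with no arithmetic right shift involved.

#include <stdint.h>

/* Returns eta - 1 when mask == 0, and -eta - 1 when mask == 0xFFFF,
 * mirroring `eta = (eta ^ c1) - c1 - 1`: XOR with an all-ones mask flips
 * every bit, and subtracting the mask (== -1 mod 2^16) adds 1, which
 * completes a two's-complement negation. */
static uint16_t eta_step(uint16_t eta, uint16_t mask) {
    return (uint16_t)((eta ^ mask) - mask - 1);
}

(Note that both operands are promoted to int before the arithmetic, which is exactly the subtlety raised in the thread below.)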


/* Start from the negated identity matrix: u = r = -1, v = q = 0. */
uint64_t u = -(uint64_t)1, v = 0, q = 0, r = -(uint64_t)1;
uint64_t c1, c2, f = f0, g = g0, x, y, z;
int i;

for (i = 0; i < 62; ++i) {

VERIFY_CHECK((u * f0 + v * g0) == -f << i);
VERIFY_CHECK((q * f0 + r * g0) == -g << i);

/* c1 is all ones if eta is negative (top bit set) and g is odd. */
c1 = -(g & (eta >> 15));

/* Conditionally (on c1) swap f and g and negate the new g; the matrix
 * rows (u,v) and (q,r) undergo the same transformation. */
x = (f ^ g) & c1;
f ^= x; g ^= x; g ^= c1; g -= c1;

y = (u ^ q) & c1;
u ^= y; q ^= y; q ^= c1; q -= c1;

z = (v ^ r) & c1;
v ^= z; r ^= z; r ^= c1; r -= c1;

eta = (eta ^ (uint16_t)c1) - (uint16_t)c1 - 1;
Contributor:

On a related note, seeing integer types smaller than int makes me somewhat nervous. Most people looking at this line, for example, won't notice that all the arithmetic here is performed on (signed) int due to integer promotion rules. (I believe that's not an issue in this line; it's just a general remark.)

Contributor Author (peterdettman):

Am I just missing a U (i.e. 1U), or else could you explain?

Contributor (real-or-random, Jul 22, 2020):

The way integer operations (and this includes bitwise as well as unary operators, shifts, and comparisons) are done in C can be pretty unexpected for types smaller than int: before any other rules apply, "Integer types smaller than int are promoted when an operation is performed on them. If all values of the original type can be represented as an int, the value of the smaller type is converted to an int; otherwise, it is converted to an unsigned int" [1]. On our targets, int is the same as int32_t, so a uint16_t for sure fits in an int. The possibility that an unsigned integer is converted to a signed integer is fucked up because signed overflow is undefined behavior. And compilers have been reported to rely on this [2, 3].

For this line here, this means eta ^ (uint16_t)c1 is evaluated by first converting eta and (uint16_t)c1 to int and then performing the ^. Then for the -, the right operand (uint16_t)c1 is converted to int. In those cases, it's easy to see that it doesn't make a difference, at least for our targets.

In general, if you ignore theoretical cases like an int that is 17 bits wide, it seems (without proof) that the only interesting operations are multiplication, left shifts, right shifts, and comparisons: the former two because of possible overflows, comparisons because of issues with comparing signed and unsigned values, and right shifts because it's relevant whether the left operand is signed or unsigned.

The real fuckup here is that the stdint.h types were supposed to be used for portable arithmetic, but portable arithmetic is not possible in C because all integer arithmetic, no matter what type, implicitly depends on the size of int by the above rule. Our code would probably blow up on a target where int is 64 bits. (Yes, this exists.)

The typical trick to work around this is to start the arithmetic with an unsigned int value, e.g., (1U * eta) ^ (uint16_t)c1. No need to point out that this is ugly.

[1] https://wiki.sei.cmu.edu/confluence/display/c/INT02-C.+Understand+integer+conversion+rules
[2] https://stackoverflow.com/questions/24795651/whats-the-best-c-way-to-multiply-unsigned-integers-modularly-safely (the link is for C++, but the same rules apply in this case)
[3] https://www.cryptologie.net/article/418/integer-promotion-in-c/
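
A minimal demonstration of the hazard just described (hypothetical code, not from the PR): with a 32-bit int, both operands of the multiplication below are promoted from uint16_t to signed int, so a product the programmer expects to wrap as unsigned can overflow int instead, which is undefined behavior.

#include <stdint.h>

uint32_t square_u16(uint16_t x) {
    /* x is promoted to (signed) int before the multiplication; with a
     * 32-bit int and x == 0xFFFF, x * x == 0xFFFE0001 > INT_MAX: UB. */
    return (uint32_t)(x * x);
}

uint32_t square_u16_fixed(uint16_t x) {
    /* The 1U trick from the comment above: the first operand is an
     * unsigned int, so the whole expression stays unsigned. */
    return (1U * x) * x;
}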


/* c2 is all ones if g is odd; if so, add f to g (and the row (u,v) to
 * (q,r)), then halve g and double (u,v). */
c2 = -(g & 1);

g += (f & c2); g >>= 1;
q += (u & c2); u <<= 1;
r += (v & c2); v <<= 1;
}

t[0] = (int64_t)u;
t[1] = (int64_t)v;
t[2] = (int64_t)q;
t[3] = (int64_t)r;

return eta;
}

/* Apply the transition matrix [u,v; q,r] in t[0..3] to the signed-62-bit
 * vectors f and g of the given length: (f,g) := -([u,v; q,r] * (f,g)) / 2^62.
 * The VERIFY_CHECKs below confirm the discarded low 62 bits are zero. */
static void secp256k1_fe_update_fg(int len, int64_t *f, int64_t *g, int64_t *t) {

const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
int64_t u = t[0], v = t[1], q = t[2], r = t[3], fi, gi;
int128_t cf = 0, cg = 0;
int i;

VERIFY_CHECK(len > 0);

fi = f[0];
gi = g[0];

cf -= (int128_t)u * fi + (int128_t)v * gi;
cg -= (int128_t)q * fi + (int128_t)r * gi;

VERIFY_CHECK(((int64_t)cf & M62) == 0);
VERIFY_CHECK(((int64_t)cg & M62) == 0);

cf >>= 62;
cg >>= 62;

for (i = 1; i < len; ++i) {

fi = f[i];
gi = g[i];

cf -= (int128_t)u * fi + (int128_t)v * gi;
cg -= (int128_t)q * fi + (int128_t)r * gi;

f[i - 1] = (int64_t)cf & M62; cf >>= 62;
g[i - 1] = (int64_t)cg & M62; cg >>= 62;
}

f[i - 1] = (int64_t)cf;
g[i - 1] = (int64_t)cg;
}

static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) {

#if 1 /* Set to 0 to use the addition-chain inversion below instead. */

/* TODO Check for a == 0? */

int64_t t[12 * 4];
int64_t f[5] = { 0x3FFFFFFEFFFFFC2FLL, 0x3FFFFFFFFFFFFFFFLL, 0x3FFFFFFFFFFFFFFFLL,
0x3FFFFFFFFFFFFFFFLL, 0xFFLL };
int64_t g[5];
secp256k1_fe b0, d0, a1, b1, c1, d1;
int i, len, sign;
int16_t eta;

/* TODO 2^256 (mod p) is small, so it could be faster to multiply the input
* by 2^768, and then the output by 2^24. */
/* Instead of dividing the output by 2^744, we scale the input. */
secp256k1_fe_mul(&b0, a, &SECP256K1_FE_TWO_POW_744);
secp256k1_fe_normalize(&b0);
secp256k1_fe_encode_62(&g[0], &b0);

eta = -1;

for (i = 0; i < 12; ++i) {
eta = secp256k1_fe_divsteps_62(eta, f[0], g[0], &t[i * 4]);
len = i <= 6 ? 5 : i >= 10 ? 1 : 11 - i;
secp256k1_fe_update_fg(len, f, g, &t[i * 4]);
}

/* At this point, f must equal +/- 1 (the GCD); f is odd, so bit 1
 * distinguishes the two cases. */
sign = (f[0] >> 1) & 1;

/* Combine the twelve 62-divstep matrices in groups of four; each group
 * reduces to a single matrix with four-limb (252-bit) entries. */
for (i = 0; i < 3; ++i) {
int tOff = i * 16;
secp256k1_fe_combine_1s(&t[tOff + 0]);
secp256k1_fe_combine_1s(&t[tOff + 8]);
secp256k1_fe_combine_2s(&t[tOff + 0]);
}

/* Combine the three remaining matrices modulo p; only the entries that
 * contribute to the final coefficient are needed, hence the commented-out
 * decodes and multiplications. */
/* secp256k1_fe_decode_matrix(&a0, &t[0]); */
secp256k1_fe_decode_matrix(&b0, &t[4]);
/* secp256k1_fe_decode_matrix(&c0, &t[8]); */
secp256k1_fe_decode_matrix(&d0, &t[12]);

secp256k1_fe_decode_matrix(&a1, &t[16]);
secp256k1_fe_decode_matrix(&b1, &t[20]);
secp256k1_fe_decode_matrix(&c1, &t[24]);
secp256k1_fe_decode_matrix(&d1, &t[28]);

secp256k1_fe_mul(&a1, &a1, &b0);
secp256k1_fe_mul(&b1, &b1, &d0);
secp256k1_fe_mul(&c1, &c1, &b0);
secp256k1_fe_mul(&d1, &d1, &d0);

b0 = a1; secp256k1_fe_add(&b0, &b1);
d0 = c1; secp256k1_fe_add(&d0, &d1);

secp256k1_fe_decode_matrix(&a1, &t[32]);
secp256k1_fe_decode_matrix(&b1, &t[36]);
/* secp256k1_fe_decode_matrix(&c1, &t[40]); */
/* secp256k1_fe_decode_matrix(&d1, &t[44]); */

secp256k1_fe_mul(&a1, &a1, &b0);
secp256k1_fe_mul(&b1, &b1, &d0);
/* secp256k1_fe_mul(&c1, &c1, &b0); */
/* secp256k1_fe_mul(&d1, &d1, &d0); */

b0 = a1; secp256k1_fe_add(&b0, &b1);
/* d0 = c1; secp256k1_fe_add(&d0, &d1); */

secp256k1_fe_negate(&b1, &b0, 2);
secp256k1_fe_cmov(&b0, &b1, sign);
secp256k1_fe_normalize_weak(&b0);

*r = b0;

#else

secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
int j;

/** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
* { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
* [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
*/

secp256k1_fe_sqr(&x2, a);
secp256k1_fe_mul(&x2, &x2, a);

secp256k1_fe_sqr(&x3, &x2);
secp256k1_fe_mul(&x3, &x3, a);

x6 = x3;
for (j=0; j<3; j++) {
secp256k1_fe_sqr(&x6, &x6);
}
secp256k1_fe_mul(&x6, &x6, &x3);

x9 = x6;
for (j=0; j<3; j++) {
secp256k1_fe_sqr(&x9, &x9);
}
secp256k1_fe_mul(&x9, &x9, &x3);

x11 = x9;
for (j=0; j<2; j++) {
secp256k1_fe_sqr(&x11, &x11);
}
secp256k1_fe_mul(&x11, &x11, &x2);

x22 = x11;
for (j=0; j<11; j++) {
secp256k1_fe_sqr(&x22, &x22);
}
secp256k1_fe_mul(&x22, &x22, &x11);

x44 = x22;
for (j=0; j<22; j++) {
secp256k1_fe_sqr(&x44, &x44);
}
secp256k1_fe_mul(&x44, &x44, &x22);

x88 = x44;
for (j=0; j<44; j++) {
secp256k1_fe_sqr(&x88, &x88);
}
secp256k1_fe_mul(&x88, &x88, &x44);

x176 = x88;
for (j=0; j<88; j++) {
secp256k1_fe_sqr(&x176, &x176);
}
secp256k1_fe_mul(&x176, &x176, &x88);

x220 = x176;
for (j=0; j<44; j++) {
secp256k1_fe_sqr(&x220, &x220);
}
secp256k1_fe_mul(&x220, &x220, &x44);

x223 = x220;
for (j=0; j<3; j++) {
secp256k1_fe_sqr(&x223, &x223);
}
secp256k1_fe_mul(&x223, &x223, &x3);

/* The final result is then assembled using a sliding window over the blocks. */

t1 = x223;
for (j=0; j<23; j++) {
secp256k1_fe_sqr(&t1, &t1);
}
secp256k1_fe_mul(&t1, &t1, &x22);
for (j=0; j<5; j++) {
secp256k1_fe_sqr(&t1, &t1);
}
secp256k1_fe_mul(&t1, &t1, a);
for (j=0; j<3; j++) {
secp256k1_fe_sqr(&t1, &t1);
}
secp256k1_fe_mul(&t1, &t1, &x2);
for (j=0; j<2; j++) {
secp256k1_fe_sqr(&t1, &t1);
}
secp256k1_fe_mul(r, a, &t1);
#endif
}
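
As a usage sketch (hypothetical test helper, not part of the PR), the property either branch of _fe_inv must satisfy: for nonzero a, a * a^-1 normalizes to 1, which in the 5x52 representation used in this file can be checked directly on the limbs.

/* Hypothetical test helper, not from the PR: verify a * a^-1 == 1. */
static void test_fe_inv_roundtrip(const secp256k1_fe *a) {
    secp256k1_fe ai, prod;
    secp256k1_fe_inv(&ai, a);
    secp256k1_fe_mul(&prod, a, &ai);
    secp256k1_fe_normalize(&prod);
    VERIFY_CHECK(prod.n[0] == 1 && prod.n[1] == 0 && prod.n[2] == 0 &&
                 prod.n[3] == 0 && prod.n[4] == 0);
}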

#endif /* SECP256K1_FIELD_REPR_IMPL_H */