From dfe442c7173173de5f9079112dda2d609fa02365 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 17 Jan 2023 19:32:55 -0600 Subject: [PATCH 001/200] add c and asm source files from blst --- crypto/blst_src/LICENSE | 201 + crypto/blst_src/aggregate.c | 673 +++ crypto/blst_src/asm/add_mod_256-armv8.pl | 412 ++ crypto/blst_src/asm/add_mod_256-x86_64.pl | 547 +++ crypto/blst_src/asm/add_mod_384-armv8.pl | 937 ++++ crypto/blst_src/asm/add_mod_384-x86_64.pl | 1500 ++++++ crypto/blst_src/asm/add_mod_384x384-x86_64.pl | 260 + crypto/blst_src/asm/arm-xlate.pl | 386 ++ .../blst_src/asm/ct_inverse_mod_256-armv8.pl | 586 +++ .../blst_src/asm/ct_inverse_mod_256-x86_64.pl | 837 ++++ .../blst_src/asm/ct_inverse_mod_384-armv8.pl | 610 +++ .../asm/ct_is_square_mod_384-armv8.pl | 401 ++ .../asm/ct_is_square_mod_384-x86_64.pl | 494 ++ .../asm/ctq_inverse_mod_384-x86_64.pl | 886 ++++ .../asm/ctx_inverse_mod_384-x86_64.pl | 995 ++++ crypto/blst_src/asm/div3w-armv8.pl | 122 + crypto/blst_src/asm/div3w-x86_64.pl | 184 + crypto/blst_src/asm/mul_mont_256-armv8.pl | 409 ++ crypto/blst_src/asm/mul_mont_384-armv8.pl | 2015 ++++++++ crypto/blst_src/asm/mulq_mont_256-x86_64.pl | 513 ++ crypto/blst_src/asm/mulq_mont_384-x86_64.pl | 2675 +++++++++++ crypto/blst_src/asm/mulx_mont_256-x86_64.pl | 486 ++ crypto/blst_src/asm/mulx_mont_384-x86_64.pl | 2384 ++++++++++ crypto/blst_src/asm/sha256-armv8.pl | 541 +++ crypto/blst_src/asm/sha256-portable-x86_64.pl | 337 ++ crypto/blst_src/asm/sha256-x86_64.pl | 789 +++ crypto/blst_src/asm/x86_64-xlate.pl | 1781 +++++++ crypto/blst_src/blst_t.hpp | 538 +++ crypto/blst_src/build/assembly.S | 123 + crypto/blst_src/build/bindings_trim.pl | 37 + .../blst_src/build/coff/add_mod_256-armv8.S | 397 ++ .../blst_src/build/coff/add_mod_256-x86_64.s | 911 ++++ .../blst_src/build/coff/add_mod_384-armv8.S | 1056 ++++ .../blst_src/build/coff/add_mod_384-x86_64.s | 2481 ++++++++++ .../build/coff/add_mod_384x384-x86_64.s | 326 ++ .../build/coff/ct_inverse_mod_256-armv8.S | 798 ++++ .../build/coff/ct_inverse_mod_256-x86_64.s | 1209 +++++ .../build/coff/ct_inverse_mod_384-armv8.S | 729 +++ .../build/coff/ct_is_square_mod_384-armv8.S | 334 ++ .../build/coff/ct_is_square_mod_384-x86_64.s | 505 ++ .../build/coff/ctq_inverse_mod_384-x86_64.s | 1221 +++++ .../build/coff/ctx_inverse_mod_384-x86_64.s | 1596 +++++++ crypto/blst_src/build/coff/div3w-armv8.S | 94 + crypto/blst_src/build/coff/div3w-x86_64.s | 140 + .../blst_src/build/coff/mul_mont_256-armv8.S | 474 ++ .../blst_src/build/coff/mul_mont_384-armv8.S | 2424 ++++++++++ .../build/coff/mulq_mont_256-x86_64.s | 872 ++++ .../build/coff/mulq_mont_384-x86_64.s | 4206 ++++++++++++++++ .../build/coff/mulx_mont_256-x86_64.s | 784 +++ .../build/coff/mulx_mont_384-x86_64.s | 3559 ++++++++++++++ crypto/blst_src/build/coff/sha256-armv8.S | 1087 +++++ .../build/coff/sha256-portable-x86_64.s | 1784 +++++++ crypto/blst_src/build/coff/sha256-x86_64.s | 1560 ++++++ crypto/blst_src/build/elf/add_mod_256-armv8.S | 379 ++ .../blst_src/build/elf/add_mod_256-x86_64.s | 572 +++ crypto/blst_src/build/elf/add_mod_384-armv8.S | 1000 ++++ .../blst_src/build/elf/add_mod_384-x86_64.s | 1907 ++++++++ .../build/elf/add_mod_384x384-x86_64.s | 252 + .../build/elf/ct_inverse_mod_256-armv8.S | 784 +++ .../build/elf/ct_inverse_mod_256-x86_64.s | 1185 +++++ .../build/elf/ct_inverse_mod_384-armv8.S | 717 +++ .../build/elf/ct_is_square_mod_384-armv8.S | 324 ++ .../build/elf/ct_is_square_mod_384-x86_64.s | 479 ++ .../build/elf/ctq_inverse_mod_384-x86_64.s | 1195 +++++ 
.../build/elf/ctx_inverse_mod_384-x86_64.s | 1574 ++++++ crypto/blst_src/build/elf/div3w-armv8.S | 88 + crypto/blst_src/build/elf/div3w-x86_64.s | 123 + .../blst_src/build/elf/mul_mont_256-armv8.S | 464 ++ .../blst_src/build/elf/mul_mont_384-armv8.S | 2372 +++++++++ .../blst_src/build/elf/mulq_mont_256-x86_64.s | 714 +++ .../blst_src/build/elf/mulq_mont_384-x86_64.s | 3620 ++++++++++++++ .../blst_src/build/elf/mulx_mont_256-x86_64.s | 627 +++ .../blst_src/build/elf/mulx_mont_384-x86_64.s | 2968 ++++++++++++ crypto/blst_src/build/elf/sha256-armv8.S | 1077 +++++ .../build/elf/sha256-portable-x86_64.s | 1754 +++++++ crypto/blst_src/build/elf/sha256-x86_64.s | 1446 ++++++ .../blst_src/build/mach-o/add_mod_256-armv8.S | 379 ++ .../build/mach-o/add_mod_256-x86_64.s | 564 +++ .../blst_src/build/mach-o/add_mod_384-armv8.S | 1000 ++++ .../build/mach-o/add_mod_384-x86_64.s | 1899 ++++++++ .../build/mach-o/add_mod_384x384-x86_64.s | 244 + .../build/mach-o/ct_inverse_mod_256-armv8.S | 784 +++ .../build/mach-o/ct_inverse_mod_256-x86_64.s | 1177 +++++ .../build/mach-o/ct_inverse_mod_384-armv8.S | 717 +++ .../build/mach-o/ct_is_square_mod_384-armv8.S | 324 ++ .../mach-o/ct_is_square_mod_384-x86_64.s | 471 ++ .../build/mach-o/ctq_inverse_mod_384-x86_64.s | 1187 +++++ .../build/mach-o/ctx_inverse_mod_384-x86_64.s | 1566 ++++++ crypto/blst_src/build/mach-o/div3w-armv8.S | 88 + crypto/blst_src/build/mach-o/div3w-x86_64.s | 115 + .../build/mach-o/mul_mont_256-armv8.S | 464 ++ .../build/mach-o/mul_mont_384-armv8.S | 2372 +++++++++ .../build/mach-o/mulq_mont_256-x86_64.s | 706 +++ .../build/mach-o/mulq_mont_384-x86_64.s | 3612 ++++++++++++++ .../build/mach-o/mulx_mont_256-x86_64.s | 619 +++ .../build/mach-o/mulx_mont_384-x86_64.s | 2960 ++++++++++++ crypto/blst_src/build/mach-o/sha256-armv8.S | 1077 +++++ .../build/mach-o/sha256-portable-x86_64.s | 1746 +++++++ crypto/blst_src/build/mach-o/sha256-x86_64.s | 1438 ++++++ crypto/blst_src/build/refresh.sh | 49 + .../build/win64/add_mod_256-armv8.asm | 380 ++ .../build/win64/add_mod_256-x86_64.asm | 934 ++++ .../build/win64/add_mod_384-armv8.asm | 1001 ++++ .../build/win64/add_mod_384-x86_64.asm | 2504 ++++++++++ .../build/win64/add_mod_384x384-x86_64.asm | 334 ++ crypto/blst_src/build/win64/blst.def | 217 + .../build/win64/ct_inverse_mod_256-armv8.asm | 785 +++ .../build/win64/ct_inverse_mod_256-x86_64.asm | 1211 +++++ .../build/win64/ct_inverse_mod_384-armv8.asm | 718 +++ .../win64/ct_is_square_mod_384-armv8.asm | 325 ++ .../win64/ct_is_square_mod_384-x86_64.asm | 509 ++ .../win64/ctq_inverse_mod_384-x86_64.asm | 1224 +++++ .../win64/ctx_inverse_mod_384-x86_64.asm | 1597 +++++++ crypto/blst_src/build/win64/div3w-armv8.asm | 89 + crypto/blst_src/build/win64/div3w-x86_64.asm | 152 + crypto/blst_src/build/win64/dll.c | 32 + .../build/win64/mul_mont_256-armv8.asm | 465 ++ .../build/win64/mul_mont_384-armv8.asm | 2373 +++++++++ .../build/win64/mulq_mont_256-x86_64.asm | 884 ++++ .../build/win64/mulq_mont_384-x86_64.asm | 4233 +++++++++++++++++ .../build/win64/mulx_mont_256-x86_64.asm | 796 ++++ .../build/win64/mulx_mont_384-x86_64.asm | 3586 ++++++++++++++ crypto/blst_src/build/win64/sha256-armv8.asm | 1078 +++++ crypto/blst_src/build/win64/sha256-x86_64.asm | 1570 ++++++ crypto/blst_src/bulk_addition.c | 168 + crypto/blst_src/bytes.h | 152 + crypto/blst_src/client_min_pk.c | 17 + crypto/blst_src/client_min_sig.c | 17 + crypto/blst_src/consts.c | 36 + crypto/blst_src/consts.h | 30 + crypto/blst_src/e1.c | 564 +++ crypto/blst_src/e2.c | 638 +++ crypto/blst_src/ec_mult.h 
| 289 ++ crypto/blst_src/ec_ops.h | 787 +++ crypto/blst_src/errors.h | 19 + crypto/blst_src/exp.c | 55 + crypto/blst_src/exports.c | 559 +++ crypto/blst_src/fields.h | 116 + crypto/blst_src/fp12_tower.c | 789 +++ crypto/blst_src/hash_to_field.c | 177 + crypto/blst_src/keygen.c | 319 ++ crypto/blst_src/map_to_g1.c | 559 +++ crypto/blst_src/map_to_g2.c | 444 ++ crypto/blst_src/multi_scalar.c | 414 ++ crypto/blst_src/no_asm.h | 1345 ++++++ crypto/blst_src/pairing.c | 444 ++ crypto/blst_src/pentaroot-addchain.h | 333 ++ crypto/blst_src/pentaroot.c | 76 + crypto/blst_src/point.h | 62 + crypto/blst_src/rb_tree.c | 145 + crypto/blst_src/recip-addchain.h | 489 ++ crypto/blst_src/recip.c | 139 + crypto/blst_src/server.c | 27 + crypto/blst_src/sha256.h | 140 + crypto/blst_src/sqrt-addchain.h | 489 ++ crypto/blst_src/sqrt.c | 261 + crypto/blst_src/vect.c | 176 + crypto/blst_src/vect.h | 418 ++ 158 files changed, 140075 insertions(+) create mode 100644 crypto/blst_src/LICENSE create mode 100644 crypto/blst_src/aggregate.c create mode 100755 crypto/blst_src/asm/add_mod_256-armv8.pl create mode 100755 crypto/blst_src/asm/add_mod_256-x86_64.pl create mode 100755 crypto/blst_src/asm/add_mod_384-armv8.pl create mode 100755 crypto/blst_src/asm/add_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/add_mod_384x384-x86_64.pl create mode 100755 crypto/blst_src/asm/arm-xlate.pl create mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl create mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl create mode 100755 crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl create mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl create mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl create mode 100755 crypto/blst_src/asm/div3w-armv8.pl create mode 100755 crypto/blst_src/asm/div3w-x86_64.pl create mode 100755 crypto/blst_src/asm/mul_mont_256-armv8.pl create mode 100755 crypto/blst_src/asm/mul_mont_384-armv8.pl create mode 100755 crypto/blst_src/asm/mulq_mont_256-x86_64.pl create mode 100755 crypto/blst_src/asm/mulq_mont_384-x86_64.pl create mode 100755 crypto/blst_src/asm/mulx_mont_256-x86_64.pl create mode 100755 crypto/blst_src/asm/mulx_mont_384-x86_64.pl create mode 100755 crypto/blst_src/asm/sha256-armv8.pl create mode 100755 crypto/blst_src/asm/sha256-portable-x86_64.pl create mode 100755 crypto/blst_src/asm/sha256-x86_64.pl create mode 100755 crypto/blst_src/asm/x86_64-xlate.pl create mode 100644 crypto/blst_src/blst_t.hpp create mode 100644 crypto/blst_src/build/assembly.S create mode 100755 crypto/blst_src/build/bindings_trim.pl create mode 100644 crypto/blst_src/build/coff/add_mod_256-armv8.S create mode 100644 crypto/blst_src/build/coff/add_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/add_mod_384-armv8.S create mode 100644 crypto/blst_src/build/coff/add_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/add_mod_384x384-x86_64.s create mode 100644 crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S create mode 100644 crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S create mode 100644 crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S create mode 100644 crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s create mode 100644 
crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/div3w-armv8.S create mode 100644 crypto/blst_src/build/coff/div3w-x86_64.s create mode 100644 crypto/blst_src/build/coff/mul_mont_256-armv8.S create mode 100644 crypto/blst_src/build/coff/mul_mont_384-armv8.S create mode 100644 crypto/blst_src/build/coff/mulq_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/mulq_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/mulx_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/coff/mulx_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/coff/sha256-armv8.S create mode 100644 crypto/blst_src/build/coff/sha256-portable-x86_64.s create mode 100644 crypto/blst_src/build/coff/sha256-x86_64.s create mode 100644 crypto/blst_src/build/elf/add_mod_256-armv8.S create mode 100644 crypto/blst_src/build/elf/add_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/add_mod_384-armv8.S create mode 100644 crypto/blst_src/build/elf/add_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/add_mod_384x384-x86_64.s create mode 100644 crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S create mode 100644 crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S create mode 100644 crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S create mode 100644 crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/div3w-armv8.S create mode 100644 crypto/blst_src/build/elf/div3w-x86_64.s create mode 100644 crypto/blst_src/build/elf/mul_mont_256-armv8.S create mode 100644 crypto/blst_src/build/elf/mul_mont_384-armv8.S create mode 100644 crypto/blst_src/build/elf/mulq_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/mulq_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/mulx_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/elf/mulx_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/elf/sha256-armv8.S create mode 100644 crypto/blst_src/build/elf/sha256-portable-x86_64.s create mode 100644 crypto/blst_src/build/elf/sha256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/add_mod_256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/add_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/add_mod_384-armv8.S create mode 100644 crypto/blst_src/build/mach-o/add_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S create mode 100644 crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S create mode 100644 crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/div3w-armv8.S create mode 100644 crypto/blst_src/build/mach-o/div3w-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mul_mont_256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/mul_mont_384-armv8.S create mode 100644 
crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/sha256-armv8.S create mode 100644 crypto/blst_src/build/mach-o/sha256-portable-x86_64.s create mode 100644 crypto/blst_src/build/mach-o/sha256-x86_64.s create mode 100755 crypto/blst_src/build/refresh.sh create mode 100644 crypto/blst_src/build/win64/add_mod_256-armv8.asm create mode 100644 crypto/blst_src/build/win64/add_mod_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/add_mod_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/add_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/blst.def create mode 100644 crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm create mode 100644 crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/div3w-armv8.asm create mode 100644 crypto/blst_src/build/win64/div3w-x86_64.asm create mode 100644 crypto/blst_src/build/win64/dll.c create mode 100644 crypto/blst_src/build/win64/mul_mont_256-armv8.asm create mode 100644 crypto/blst_src/build/win64/mul_mont_384-armv8.asm create mode 100644 crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm create mode 100644 crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm create mode 100644 crypto/blst_src/build/win64/sha256-armv8.asm create mode 100644 crypto/blst_src/build/win64/sha256-x86_64.asm create mode 100644 crypto/blst_src/bulk_addition.c create mode 100644 crypto/blst_src/bytes.h create mode 100644 crypto/blst_src/client_min_pk.c create mode 100644 crypto/blst_src/client_min_sig.c create mode 100644 crypto/blst_src/consts.c create mode 100644 crypto/blst_src/consts.h create mode 100644 crypto/blst_src/e1.c create mode 100644 crypto/blst_src/e2.c create mode 100644 crypto/blst_src/ec_mult.h create mode 100644 crypto/blst_src/ec_ops.h create mode 100644 crypto/blst_src/errors.h create mode 100644 crypto/blst_src/exp.c create mode 100644 crypto/blst_src/exports.c create mode 100644 crypto/blst_src/fields.h create mode 100644 crypto/blst_src/fp12_tower.c create mode 100644 crypto/blst_src/hash_to_field.c create mode 100644 crypto/blst_src/keygen.c create mode 100644 crypto/blst_src/map_to_g1.c create mode 100644 crypto/blst_src/map_to_g2.c create mode 100644 crypto/blst_src/multi_scalar.c create mode 100644 crypto/blst_src/no_asm.h create mode 100644 crypto/blst_src/pairing.c create mode 100644 crypto/blst_src/pentaroot-addchain.h create mode 100644 crypto/blst_src/pentaroot.c create mode 100644 crypto/blst_src/point.h create mode 100644 crypto/blst_src/rb_tree.c create mode 100644 crypto/blst_src/recip-addchain.h create mode 100644 crypto/blst_src/recip.c create mode 100644 crypto/blst_src/server.c create mode 
100644 crypto/blst_src/sha256.h create mode 100644 crypto/blst_src/sqrt-addchain.h create mode 100644 crypto/blst_src/sqrt.c create mode 100644 crypto/blst_src/vect.c create mode 100644 crypto/blst_src/vect.h diff --git a/crypto/blst_src/LICENSE b/crypto/blst_src/LICENSE new file mode 100644 index 00000000000..261eeb9e9f8 --- /dev/null +++ b/crypto/blst_src/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/crypto/blst_src/aggregate.c b/crypto/blst_src/aggregate.c new file mode 100644 index 00000000000..8a24e0590ba --- /dev/null +++ b/crypto/blst_src/aggregate.c @@ -0,0 +1,673 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Usage pattern on single-processor system is + * + * blst_pairing_init(ctx, hash_or_encode, DST); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]); + * ... + * blst_pairing_commit(ctx); + * blst_pairing_finalverify(ctx, NULL); + * + *********************************************************************** + * Usage pattern on multi-processor system is + * + * blst_pairing_init(pk[0], hash_or_encode, DST); + * blst_pairing_init(pk[1], hash_or_encode, DST); + * ... + * start threads each processing an N/nthreads slice of PKs and messages: + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]); + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]); + * ... + * blst_pairing_commit(pkx); + * ... + * meanwhile in main thread + * blst_fp12 gtsig; + * blst_aggregated_in_g2(&gtsig, aggregated_signature); + * join threads and merge their contexts: + * blst_pairing_merge(pk[0], pk[1]); + * blst_pairing_merge(pk[0], pk[2]); + * ... + * blst_pairing_finalverify(pk[0], gtsig); + */ + +#ifndef N_MAX +# define N_MAX 8 +#endif + +typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature; +typedef struct { + unsigned int ctrl; + unsigned int nelems; + const void *DST; + size_t DST_len; + vec384fp12 GT; + AggregatedSignature AggrSign; + POINTonE2_affine Q[N_MAX]; + POINTonE1_affine P[N_MAX]; +} PAIRING; + +enum { AGGR_UNDEFINED = 0, + AGGR_MIN_SIG = 1, + AGGR_MIN_PK = 2, + AGGR_SIGN_SET = 0x10, + AGGR_GT_SET = 0x20, + AGGR_HASH_OR_ENCODE = 0x40 }; +#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK) + +static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7; + +size_t blst_pairing_sizeof(void) +{ return sizeof_pairing; } + +void blst_pairing_init(PAIRING *ctx, int hash_or_encode, + const void *DST, size_t DST_len) +{ + ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx->nelems = 0; + ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42 + : DST; + ctx->DST_len = DST_len; +} + +static const void *pairing_get_dst(const PAIRING *ctx) +{ return (uptr_t)ctx->DST==(uptr_t)42 ? 
(const byte *)ctx+sizeof_pairing + : ctx->DST; +} + +const void *blst_pairing_get_dst(const PAIRING *ctx) +{ return pairing_get_dst(ctx); } + +#define FROM_AFFINE(out,in) do { \ + vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ + vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ + vec_is_zero(in->X, 2*sizeof(in->X))); } while(0) + +/* + * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated + * signature verification as discussed at + * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. + * Usage pattern is not finalized yet, because (sig != NULL) is better and + * will be handled separately... + */ +static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_groupcheck, + const POINTonE1_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_PK) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_SIG; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. + */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE1 *S = &ctx->AggrSign.e1; + POINTonE1 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + POINTonE1_mult_w5(P, P, scalar, nbits); + POINTonE1_dadd(S, S, P, NULL); + } else { + POINTonE1_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE1 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. 
+ */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE2 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(H, H, scalar, nbits); + + POINTonE1_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_groupcheck, + const POINTonE2_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_SIG) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_PK; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. 
+ */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE2 *S = &ctx->AggrSign.e2; + POINTonE2 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + + POINTonE2_mult_w5(P, P, scalar, nbits); + POINTonE2_dadd(S, S, P, NULL); + } else { + POINTonE2_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE2_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE2 H[1]; + POINTonE1 pk[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE1 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + POINTonE2_from_Jacobian(H, H); + + if (nbits != 0 && scalar != NULL) { + FROM_AFFINE(pk, PK); + POINTonE1_mult_w5(pk, pk, scalar, nbits); + POINTonE1_from_Jacobian(pk, pk); + PK = (const POINTonE1_affine *)pk; + } + + n = ctx->nelems; + vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static void PAIRING_Commit(PAIRING *ctx) +{ + unsigned int n; + + if ((n = ctx->nelems) != 0) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + ctx->nelems = 0; + } +} + +void 
blst_pairing_commit(PAIRING *ctx) +{ PAIRING_Commit(ctx); } + +BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) +{ + if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) + return BLST_AGGR_TYPE_MISMATCH; + + /* context producers are expected to have called blst_pairing_commit */ + if (ctx->nelems || ctx1->nelems) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + &ctx1->AggrSign.e1, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, + sizeof(ctx->AggrSign.e1)); + } + break; + case AGGR_MIN_PK: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + &ctx1->AggrSign.e2, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, + sizeof(ctx->AggrSign.e2)); + } + break; + case AGGR_UNDEFINED: + break; + default: + return BLST_AGGR_TYPE_MISMATCH; + } + + if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { + mul_fp12(ctx->GT, ctx->GT, ctx1->GT); + } else if (ctx1->ctrl & AGGR_GT_SET) { + ctx->ctrl |= AGGR_GT_SET; + vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); + } + + return BLST_SUCCESS; +} + +static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) +{ + vec384fp12 GT; + + if (!(ctx->ctrl & AGGR_GT_SET)) + return 0; + + if (GTsig != NULL) { + vec_copy(GT, GTsig, sizeof(GT)); + } else if (ctx->ctrl & AGGR_SIGN_SET) { + AggregatedSignature AggrSign; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); + miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, + (const POINTonE1_affine *)&AggrSign.e1, 1); + break; + case AGGR_MIN_PK: + POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); + miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, + (const POINTonE1_affine *)&BLS12_381_G1, 1); + break; + default: + return 0; + } + } else { + /* + * The aggregated signature was infinite, relation between the + * hashes and the public keys has to be VERY special... 
+ */ + vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); + } + + conjugate_fp12(GT); + mul_fp12(GT, GT, ctx->GT); + final_exp(GT, GT); + + /* return GT==1 */ + return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); +} + +int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) +{ return (int)PAIRING_FinalVerify(ctx, GTsig); } + +int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) +{ + vec384fp12 GT; + + vec_copy(GT, GT1, sizeof(GT)); + conjugate_fp12(GT); + mul_fp12(GT, GT, GT2); + final_exp(GT, GT); + + /* return GT==1 */ + return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); +} + +void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, + const POINTonE1_affine *p) +{ + unsigned int n; + + if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) + return; + + n = ctx->nelems; + vec_copy(ctx->Q + n, q, sizeof(*q)); + vec_copy(ctx->P + n, p, sizeof(*p)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; +} + +vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) +{ + PAIRING_Commit(ctx); + return (vec384fp12 *)ctx->GT; +} + +/* + * PAIRING context-free entry points. + * + * To perform FastAggregateVerify, aggregate all public keys and + * signatures with corresponding blst_aggregate_in_g{12}, convert + * result to affine and call suitable blst_core_verify_pk_in_g{12} + * or blst_aggregated_in_g{12}... + */ +BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, + const unsigned char *zwire) +{ + POINTonE1 P[1]; + BLST_ERROR ret; + + ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE1_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) + vec_copy(out, P, sizeof(P)); + else + POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, + const unsigned char *zwire) +{ + POINTonE2 P[1]; + BLST_ERROR ret; + + ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE2_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out, P, sizeof(P)); + } else { + POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); + } + return BLST_SUCCESS; +} + +void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) +{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } + +void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) +{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } + +BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, + const POINTonE2_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + 
BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} + +BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, + const POINTonE1_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} diff --git a/crypto/blst_src/asm/add_mod_256-armv8.pl b/crypto/blst_src/asm/add_mod_256-armv8.pl new file mode 100755 index 00000000000..34d9145261b --- /dev/null +++ b/crypto/blst_src/asm/add_mod_256-armv8.pl @@ -0,0 +1,412 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..7)); +@a=map("x$_",(8..11)); +@b=map("x$_",(12..15)); +@t=map("x$_",(16,17,1..3)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + adds @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + adcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + + adds @a[0],@b[0],@b[0] + ldp @mod[0],@mod[1],[$b_ptr] + adcs @a[1],@b[1],@b[1] + ldp @mod[2],@mod[3],[$b_ptr,#16] + adcs @a[2],@b[2],@b[2] + adcs @a[3],@b[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs 
@t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_lshift_mod_256: + adds @a[0],@a[0],@a[0] + sub $b_ptr,$b_ptr,#1 + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adc @t[4],xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + + cbnz $b_ptr,.Loop_lshift_mod_256 + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_rshift: + adds @b[0],@a[0],@mod[0] + sub $b_ptr,$b_ptr,#1 + adcs @b[1],@a[1],@mod[1] + adcs @b[2],@a[2],@mod[2] + adcs @b[3],@a[3],@mod[3] + adc @t[4],xzr,xzr + tst @a[0],#1 + + csel @b[0],@b[0],@a[0],ne + csel @b[1],@b[1],@a[1],ne + csel @b[2],@b[2],@a[2],ne + csel @b[3],@b[3],@a[3],ne + csel @t[4],@t[4],xzr,ne + + extr @a[0],@b[1],@b[0],#1 + extr @a[1],@b[2],@b[1],#1 + extr @a[2],@b[3],@b[2],#1 + extr @a[3],@t[4],@b[3],#1 + + cbnz $b_ptr,.Loop_rshift + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @b[0],@mod[0],@a[0] + ldp @mod[2],@mod[3],[$n_ptr,#16] + orr @mod[0],@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr @mod[1],@a[2],@a[3] + sbcs @b[2],@mod[2],@a[2] + orr @t[4],@mod[0],@mod[1] + sbc @b[3],@mod[3],@a[3] + + cmp @t[4],#0 + csetm @t[4],ne + ands $b_ptr,$b_ptr,@t[4] + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@b[3],eq + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + stp @a[0],@a[1],[$r_ptr] + adc @a[3],@a[3],@mod[3] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + +#ifdef __AARCH64EB__ + 
rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + subs xzr,@a[0],@mod[0] + sbcs xzr,@a[1],@mod[1] + orr @a[0],@a[0],@a[1] + sbcs xzr,@a[2],@mod[2] + orr @a[0],@a[0],@a[2] + sbcs xzr,@a[3],@mod[3] + orr @a[0],@a[0],@a[3] + sbc $a_ptr,xzr,xzr + + cmp @a[0],#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,$a_ptr + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + adds @a[0],@a[0],@b[0] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[1],@a[1],@b[1] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + adc @a[3],@a[3],@mod[3] + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ + +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_256-x86_64.pl b/crypto/blst_src/asm/add_mod_256-x86_64.pl new file mode 100755 index 00000000000..1d656fb90bf --- /dev/null +++ b/crypto/blst_src/asm/add_mod_256-x86_64.pl @@ -0,0 +1,547 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits add +my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,\@function,4,"unwind" +.align 32 +add_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loaded_a_add_mod_256: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_256,.-add_mod_256 + +######################################################################## +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,\@function,3,"unwind" +.align 32 +mul_by_3_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org,$n_ptr + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $a_ptr,$b_org + mov 8*3($a_ptr), @acc[3] + + call __lshift_mod_256 + mov 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,\@abi-omnipotent +.align 32 +__lshift_mod_256: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + mov @acc[0], @acc[4] + adc @acc[2], @acc[2] + mov @acc[1], @acc[5] + adc @acc[3], @acc[3] + sbb @acc[8], @acc[8] + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, @acc[8] + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + cmovc @acc[6], @acc[2] + cmovc @acc[7], @acc[3] + + ret +.size __lshift_mod_256,.-__lshift_mod_256 + +######################################################################## +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,\@function,4,"unwind" +.align 32 +lshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 
+.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_lshift_mod_256: + call __lshift_mod_256 + dec %edx + jnz .Loop_lshift_mod_256 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + +######################################################################## +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,\@function,4,"unwind" +.align 32 +rshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[7] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_rshift_mod_256: + mov @acc[7], @acc[0] + and \$1, @acc[7] + mov 8*0($n_ptr), @acc[4] + neg @acc[7] + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + + and @acc[7], @acc[4] + and @acc[7], @acc[5] + and @acc[7], @acc[6] + and 8*3($n_ptr), @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + sbb @acc[4], @acc[4] + + shr \$1, @acc[0] + mov @acc[1], @acc[7] + shr \$1, @acc[1] + mov @acc[2], @acc[6] + shr \$1, @acc[2] + mov @acc[3], @acc[5] + shr \$1, @acc[3] + + shl \$63, @acc[7] + shl \$63, @acc[6] + or @acc[0], @acc[7] + shl \$63, @acc[5] + or @acc[6], @acc[1] + shl \$63, @acc[4] + or @acc[5], @acc[2] + or @acc[4], @acc[3] + + dec %edx + jnz .Loop_rshift_mod_256 + + mov @acc[7], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + +######################################################################## +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,\@function,4,"unwind" +.align 32 +cneg_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[8] # load a[0:3] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov @acc[8], @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], @acc[8] + or @acc[2], @acc[8] + or @acc[3], @acc[8] + mov \$-1, @acc[7] + + mov 8*0($n_ptr), @acc[4] # load n[0:3] + cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + and @acc[8], @acc[4] # n[0:3] &= mask + mov 8*3($n_ptr), @acc[7] + and @acc[8], @acc[5] + and @acc[8], @acc[6] + and @acc[8], @acc[7] + + sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 + sbb @acc[1], @acc[5] + sbb @acc[2], @acc[6] + sbb @acc[3], @acc[7] + + or $b_org, $b_org # check condition flag + + cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] + cmovz @acc[1], @acc[5] + mov @acc[4], 8*0($r_ptr) + cmovz @acc[2], @acc[6] + mov @acc[5], 8*1($r_ptr) + cmovz @acc[3], @acc[7] + mov @acc[6], 8*2($r_ptr) + mov @acc[7], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + +######################################################################## +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,\@function,4,"unwind" +.align 32 +sub_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + +######################################################################## +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,\@function,2,"unwind" +.align 32 +check_mod_256: +.cfi_startproc + mov 8*0($r_ptr), %rax + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + + mov %rax, @acc[0] # see if it's zero + or @acc[1], %rax + or @acc[2], %rax + or @acc[3], %rax + + sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
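+	# the sbb chain below turns $a_ptr into an all-ones mask when the
+	# value is below the modulus; combined with the zero test above,
+	# check_mod_256 returns 1 exactly when 0 < value < modulus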
+ sbb 8*1($a_ptr), @acc[1] + sbb 8*2($a_ptr), @acc[2] + sbb 8*3($a_ptr), @acc[3] + sbb $a_ptr, $a_ptr + + mov \$1, %rdx + cmp \$0, %rax + cmovne %rdx, %rax + and $a_ptr, %rax +.cfi_epilogue + ret +.cfi_endproc +.size check_mod_256,.-check_mod_256 + +######################################################################## +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,\@function,4,"unwind" +.align 32 +add_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + +######################################################################## +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,\@function,4,"unwind" +.align 32 +sub_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-armv8.pl b/crypto/blst_src/asm/add_mod_384-armv8.pl new file mode 100755 index 00000000000..6accdbb19a1 --- /dev/null +++ b/crypto/blst_src/asm/add_mod_384-armv8.pl @@ -0,0 +1,937 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..9)); +@a=map("x$_",(10..15)); +@b=map("x$_",(16,17,19..22)); +$carry=$n_ptr; + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + +__add_mod_384_ab_are_loaded: + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adcs @a[4],@a[4],@b[4] + adcs @a[5],@a[5],@b[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_rshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __rshift_mod_384 + cbnz $b_ptr,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx @b[5],@a[0],#0,#1 + and @b[0],@b[5],@mod[0] + and @b[1],@b[5],@mod[1] + adds @a[0],@a[0],@b[0] + and @b[2],@b[5],@mod[2] + adcs @a[1],@a[1],@b[1] + and @b[3],@b[5],@mod[3] + adcs @a[2],@a[2],@b[2] + and @b[4],@b[5],@mod[4] + adcs @a[3],@a[3],@b[3] + and @b[5],@b[5],@mod[5] + adcs @a[4],@a[4],@b[4] + extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 + adcs @a[5],@a[5],@b[5] + extr @a[1],@a[2],@a[1],#1 + adc @b[5],xzr,xzr + extr @a[2],@a[3],@a[2],#1 + extr @a[3],@a[4],@a[3],#1 + extr @a[4],@a[5],@a[4],#1 + extr @a[5],@b[5],@a[5],#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_lshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __lshift_mod_384 + cbnz $b_ptr,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr,#48] + ldp @b[2],@b[3],[$a_ptr,#64] + ldp @b[4],@b[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @mod[2],@mod[3],[$n_ptr,#16] + + subs @b[0],@mod[0],@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @mod[4],@mod[5],[$n_ptr,#32] + orr $carry,@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr $carry,$carry,@a[2] + sbcs @b[2],@mod[2],@a[2] + orr $carry,$carry,@a[3] + sbcs @b[3],@mod[3],@a[3] + orr $carry,$carry,@a[4] + sbcs @b[4],@mod[4],@a[4] + orr $carry,$carry,@a[5] + sbc @b[5],@mod[5],@a[5] + + cmp $carry,#0 + csetm $carry,ne + ands $b_ptr,$b_ptr,$carry + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + csel @a[3],@a[3],@b[3],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[4],@a[4],@b[4],eq + stp @a[2],@a[3],[$r_ptr,#16] + csel @a[5],@a[5],@b[5],eq + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + sbcs @a[2],@a[2],@b[2] + sbcs @a[3],@a[3],@b[3] + sbcs @a[4],@a[4],@b[4] + sbcs @a[5],@a[5],@b[5] + sbc $carry,xzr,xzr + + and @b[0],@mod[0],$carry + and @b[1],@mod[1],$carry + adds @a[0],@a[0],@b[0] + and @b[2],@mod[2],$carry + adcs @a[1],@a[1],@b[1] + and @b[3],@mod[3],$carry + adcs @a[2],@a[2],@b[2] + and @b[4],@mod[4],$carry + adcs @a[3],@a[3],@b[3] + and @b[5],@mod[5],$carry + adcs @a[4],@a[4],@b[4] + adc @a[5],@a[5],@b[5] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + add $b_ptr,$a_ptr,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $carry,$carry,xzr + + mvn $carry,$carry + and $carry,$carry,#2 + orr $r_ptr,$r_ptr,$carry + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + ldp @a[0],@a[1],[$r_ptr,#48] + ldp @a[2],@a[3],[$r_ptr,#64] + ldp @a[4],@a[5],[$r_ptr,#80] + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $b_ptr,$b_ptr,@b[0] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr 
$a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $r_ptr,$r_ptr,@b[0] + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +if (1) { +sub vec_select { +my $sz = shift; +my @v=map("v$_",(0..5,16..21)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,%function +.align 5 +vec_select_$sz: + dup v6.2d, $n_ptr + ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 +___ +for($i=0; $i<$sz-48; $i+=48) { +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 + bit @v[1].16b, @v[4].16b, v6.16b + ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 +___ + @v = @v[6..11,0..5]; +} +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + bit @v[1].16b, @v[4].16b, v6.16b + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(32); +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end, $step) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add $end, $end, $inp + sub $end, $end, #1 + mov $step, #64 + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + prfm pldl1keep, [$inp] + ret +.size vec_prefetch,.-vec_prefetch +___ +my $len = $end; + +$code.=<<___; +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,%function +.align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [$inp], #16 + lsr $len, $len, #4 + sub $len, $len, #1 + cbz $len, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [$inp], #16 + orr v0.16b, v0.16b, v1.16b + sub $len, $len, #1 + cbnz $len, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.2d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.2d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +___ +} +{ +my ($inp1, $inp2, $len) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,%function +.align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [$inp1], #16 + ld1 {v1.2d}, [$inp2], #16 + 
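+	# $len is a byte count; the comparison below works on 16-byte blocks,
+	# OR-ing together the XOR of every pair of blocks and testing the
+	# accumulator for zero only at the end, so there is no data-dependent
+	# early exit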
lsr $len, $len, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub $len, $len, #1 + cbz $len, .Loop_is_equal_done + ld1 {v1.2d}, [$inp1], #16 + ld1 {v2.2d}, [$inp2], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.2d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.2d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_equal_16x,.-vec_is_equal_16x +___ +} + +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-x86_64.pl b/crypto/blst_src/asm/add_mod_384-x86_64.pl new file mode 100755 index 00000000000..a196191c108 --- /dev/null +++ b/crypto/blst_src/asm/add_mod_384-x86_64.pl @@ -0,0 +1,1500 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 384 bits add +my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); + push(@acc, $a_ptr); + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,\@function,4,"unwind" +.align 32 +add_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__add_mod_384_a_is_loaded: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden 
add_mod_384x +.type add_mod_384x,\@function,4,"unwind" +.align 32 +add_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + +######################################################################## +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,\@function,4,"unwind" +.align 32 +rshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_rshift_mod_384: + call __rshift_mod_384 + dec %edx + jnz .Loop_rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,\@abi-omnipotent +.align 32 +__rshift_mod_384: + mov \$1, @acc[11] + mov 8*0($n_ptr), @acc[6] + and @acc[0], @acc[11] + mov 8*1($n_ptr), @acc[7] + neg @acc[11] + mov 8*2($n_ptr), @acc[8] + and @acc[11], @acc[6] + mov 8*3($n_ptr), @acc[9] + and @acc[11], @acc[7] + mov 8*4($n_ptr), @acc[10] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], @acc[10] + adc @acc[5], @acc[11] + sbb @acc[5], @acc[5] + + shr \$1, @acc[6] + mov @acc[7], @acc[0] + shr \$1, @acc[7] + mov @acc[8], @acc[1] + shr \$1, @acc[8] + mov @acc[9], @acc[2] + shr \$1, @acc[9] + mov @acc[10], @acc[3] + shr \$1, @acc[10] + mov @acc[11], @acc[4] + shr \$1, @acc[11] + shl \$63, @acc[0] + shl \$63, @acc[1] + or @acc[6], @acc[0] + shl \$63, @acc[2] + or @acc[7], @acc[1] + shl \$63, @acc[3] + or @acc[8], @acc[2] + shl \$63, @acc[4] + or @acc[9], @acc[3] + shl \$63, @acc[5] + or @acc[10], @acc[4] + or @acc[11], @acc[5] + + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,\@function,3,"unwind" +.align 32 
+div_by_2_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov $b_org, $n_ptr + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + call __rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + +######################################################################## +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,\@function,4,"unwind" +.align 32 +lshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $r_ptr, $r_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov (%rsp), $r_ptr + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + dec %edx + jnz .Loop_lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,\@abi-omnipotent +.align 32 +__lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + 
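+	# if the subtraction of the modulus borrowed, the doubled value was
+	# already fully reduced; the cmovc chain above restores it without
+	# branching on secret data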
ret +.size __lshift_mod_384,.-__lshift_mod_384 + +######################################################################## +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +######################################################################## +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov (%rsp), $a_ptr + lea 8*6($r_ptr), $r_ptr + + mov 8*6($a_ptr), @acc[0] + mov 8*7($a_ptr), @acc[1] + mov 8*8($a_ptr), @acc[2] + mov 8*9($a_ptr), @acc[3] + mov 8*10($a_ptr), @acc[4] + mov 8*11($a_ptr), @acc[5] + + call __lshift_mod_384 + + mov \$8*6, $b_org + add (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 
56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov (%rsp), $a_ptr + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 48+8*0($a_ptr), @acc[0] + mov 48+8*1($a_ptr), @acc[1] + mov 48+8*2($a_ptr), @acc[2] + mov 48+8*3($a_ptr), @acc[3] + mov 48+8*4($a_ptr), @acc[4] + mov 48+8*5($a_ptr), @acc[5] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 48+8*0($r_ptr) + mov @acc[1], 48+8*1($r_ptr) + mov @acc[2], 48+8*2($r_ptr) + mov @acc[3], 48+8*3($r_ptr) + mov @acc[4], 48+8*4($r_ptr) + mov @acc[5], 48+8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +######################################################################## +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,\@function,4,"unwind" +.align 32 +cneg_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $b_org # condition flag +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), $b_org # load a[0:5] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $b_org, @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], $b_org + mov 8*4($a_ptr), @acc[4] + or @acc[2], $b_org + mov 8*5($a_ptr), @acc[5] + or @acc[3], $b_org + mov \$-1, @acc[11] + or @acc[4], $b_org + or @acc[5], $b_org + + mov 8*0($n_ptr), @acc[6] # load n[0:5] + cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0 + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + and $b_org, @acc[6] # n[0:5] &= mask + mov 8*3($n_ptr), @acc[9] + and $b_org, @acc[7] + mov 8*4($n_ptr), @acc[10] + and $b_org, @acc[8] + mov 8*5($n_ptr), @acc[11] + and $b_org, @acc[9] + mov 0(%rsp), $n_ptr # restore condition flag + and $b_org, @acc[10] + and $b_org, @acc[11] + + sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 + sbb @acc[1], @acc[7] + sbb @acc[2], @acc[8] + sbb @acc[3], @acc[9] + sbb @acc[4], @acc[10] + sbb @acc[5], @acc[11] + + or $n_ptr, $n_ptr # check condition flag + + cmovz @acc[0], @acc[6] # flag ? 
n[0:5]-a[0:5] : a[0:5] + cmovz @acc[1], @acc[7] + cmovz @acc[2], @acc[8] + mov @acc[6], 8*0($r_ptr) + cmovz @acc[3], @acc[9] + mov @acc[7], 8*1($r_ptr) + cmovz @acc[4], @acc[10] + mov @acc[8], 8*2($r_ptr) + cmovz @acc[5], @acc[11] + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + +######################################################################## +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,\@function,4,"unwind" +.align 32 +sub_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,\@function,4,"unwind" +.align 32 +sub_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 
24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +___ +} +{ ###################################################### ret = a * (1 + i) +my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); +my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); + +$code.=<<___; +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$56, %rsp +.cfi_adjust_cfa_offset 56 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + mov $r_ptr, 8*6(%rsp) # offload r_ptr + sbb $r_ptr, $r_ptr + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $a_ptr, $a_ptr + + mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1(%rsp) + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2(%rsp) + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3(%rsp) + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4(%rsp) + and $a_ptr, @acc[0] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5(%rsp) + and $a_ptr, @acc[1] + mov 8*5($n_ptr), @acc[5] + and $a_ptr, @acc[2] + and $a_ptr, @acc[3] + and $a_ptr, @acc[4] + and $a_ptr, @acc[5] + mov 8*6(%rsp), $a_ptr # restore r_ptr + + add @acc[0], @acc[6] + mov 8*0(%rsp), @acc[0] # restore a->re + a->im + adc @acc[1], @acc[7] + mov 8*1(%rsp), @acc[1] + adc @acc[2], @acc[8] + mov 8*2(%rsp), @acc[2] + adc @acc[3], @acc[9] + mov 8*3(%rsp), @acc[3] + adc @acc[4], @acc[10] + mov 8*4(%rsp), @acc[4] + adc @acc[5], @acc[11] + mov 8*5(%rsp), @acc[5] + + mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im + mov @acc[0], @acc[6] + mov @acc[7], 8*1($a_ptr) + mov @acc[8], 8*2($a_ptr) + mov @acc[1], @acc[7] + mov @acc[9], 8*3($a_ptr) + mov @acc[10], 8*4($a_ptr) + mov @acc[2], @acc[8] + mov @acc[11], 8*5($a_ptr) + + sub 8*0($n_ptr), @acc[0] + mov @acc[3], @acc[9] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[4], @acc[10] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($a_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($a_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($a_ptr) + mov @acc[4], 8*10($a_ptr) + mov @acc[5], 8*11($a_ptr) + + mov 56+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 56+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 56+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 56+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 56+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 56+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 56+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size 
mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +___ +} +{ ###################################################### +my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); +my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); + +$code.=<<___; +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384: +.cfi_startproc +.cfi_end_prologue + mov 8*0($r_ptr), @acc[0] + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + + xor %rax, %rax + mov @acc[0], $r_ptr + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, $r_ptr + and \$2, %rax + or $r_ptr, %rax # pack sign and parity + +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*6($r_ptr), @acc[0] # sgn0(a->im) + mov 8*7($r_ptr), @acc[1] + mov 8*8($r_ptr), @acc[2] + mov 8*9($r_ptr), @acc[3] + mov 8*10($r_ptr), @acc[4] + mov 8*11($r_ptr), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), %rax # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + mov 8*0(%rax), @acc[0] + mov 8*1(%rax), @acc[1] + mov 8*2(%rax), @acc[2] + mov 8*3(%rax), @acc[3] + mov 8*4(%rax), @acc[4] + mov 8*5(%rax), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp), %rbx +.cfi_restore %rbx + mov 16(%rsp), %rbp +.cfi_restore %rbp + lea 24(%rsp), %rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +} +if (0) { +my $inp = $win64 ? 
"%rcx" : "%rdi"; +$code.=<<___; +.globl nbits_384 +.hidden nbits_384 +.type nbits_384,\@abi-omnipotent +.align 32 +nbits_384: + mov 8*5($inp), %r8 + mov 8*4($inp), %r9 + mov 8*3($inp), %r10 + mov 8*2($inp), %r11 + mov \$-1, %rdx + mov \$127, %eax + bsr %r8, %r8 + cmovnz %rdx,%r9 + cmovz %rax,%r8 + bsr %r9, %r9 + cmovnz %rdx,%r10 + cmovz %rax,%r9 + xor \$63,%r8 + bsr %r10, %r10 + cmovnz %rdx, %r11 + cmovz %rax, %r10 + xor \$63,%r9 + add %r8, %r9 + mov 8*1($inp), %r8 + bsr %r11, %r11 + cmovnz %rdx, %r8 + cmovz %rax, %r11 + xor \$63, %r10 + add %r9, %r10 + mov 8*0($inp), %r9 + bsr %r8, %r8 + cmovnz %rdx, %r9 + cmovz %rax, %r8 + xor \$63, %r11 + add %r10, %r11 + bsr %r9, %r9 + cmovz %rax, %r9 + xor \$63, %r8 + add %r11, %r8 + xor \$63, %r9 + add %r8, %r9 + mov \$384, %eax + sub %r9, %rax + ret +.size nbits_384,.-nbits_384 +___ +} + +if (1) { +my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") + : ("%rdi", "%rsi", "%rdx", "%ecx"); + +sub vec_select { +my $sz = shift; +my $half = $sz/2; +my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,\@abi-omnipotent +.align 32 +vec_select_$sz: + movd $select, %xmm5 + pxor %xmm4,%xmm4 + pshufd \$0,%xmm5,%xmm5 # broadcast + movdqu ($inp1),$xmm0 + lea $half($inp1),$inp1 + pcmpeqd %xmm4,%xmm5 + movdqu ($inp2),$xmm1 + lea $half($inp2),$inp2 + pcmpeqd %xmm5,%xmm4 + lea $half($out),$out +___ +for($i=0; $i<$sz-16; $i+=16) { +$code.=<<___; + pand %xmm4,$xmm0 + movdqu $i+16-$half($inp1),$xmm2 + pand %xmm5,$xmm1 + movdqu $i+16-$half($inp2),$xmm3 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) +___ + ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); +} +$code.=<<___; + pand %xmm4,$xmm0 + pand %xmm5,$xmm1 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(32); +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi"); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,\@abi-omnipotent +.align 32 +vec_prefetch: + leaq -1($inp,$end), $end + mov \$64, %rax + xor %r8, %r8 + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + prefetchnta ($inp) + ret +.size vec_prefetch,.-vec_prefetch +___ +my $len = $win64 ? "%edx" : "%esi"; + +$code.=<<___; +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,\@abi-omnipotent +.align 32 +vec_is_zero_16x: + shr \$4, $len + movdqu ($inp), %xmm0 + lea 16($inp), $inp + +.Loop_is_zero: + dec $len + jz .Loop_is_zero_done + movdqu ($inp), %xmm1 + lea 16($inp), $inp + por %xmm1, %xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd \$0x4e, %xmm0, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, %rax + inc $len # now it's 1 + test %rax, %rax + cmovnz $len, %eax + xor \$1, %eax + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +___ +} +{ +my ($inp1, $inp2, $len) = $win64 ? 
("%rcx", "%rdx", "%r8d") + : ("%rdi", "%rsi", "%edx"); +$code.=<<___; +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,\@abi-omnipotent +.align 32 +vec_is_equal_16x: + shr \$4, $len + movdqu ($inp1), %xmm0 + movdqu ($inp2), %xmm1 + sub $inp1, $inp2 + lea 16($inp1), $inp1 + pxor %xmm1, %xmm0 + +.Loop_is_equal: + dec $len + jz .Loop_is_equal_done + movdqu ($inp1), %xmm1 + movdqu ($inp1,$inp2), %xmm2 + lea 16($inp1), $inp1 + pxor %xmm2, %xmm1 + por %xmm1, %xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd \$0x4e, %xmm0, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, %rax + inc $len # now it's 1 + test %rax, %rax + cmovnz $len, %eax + xor \$1, %eax + ret +.size vec_is_equal_16x,.-vec_is_equal_16x +___ +} +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384x384-x86_64.pl b/crypto/blst_src/asm/add_mod_384x384-x86_64.pl new file mode 100755 index 00000000000..6ee3cf8760a --- /dev/null +++ b/crypto/blst_src/asm/add_mod_384x384-x86_64.pl @@ -0,0 +1,260 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +############################################################ 384x384 add/sub +# Double-width addition/subtraction modulo n<<384, as opposite to +# naively expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... 
+{ +my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +.type __add_mod_384x384,\@abi-omnipotent +.align 32 +__add_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + add 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + adc 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + adc 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + adc 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + adc 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + adc 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + adc 8*6($b_org), @acc[6] + mov @acc[1], 8*1($r_ptr) + adc 8*7($b_org), @acc[7] + mov @acc[2], 8*2($r_ptr) + adc 8*8($b_org), @acc[8] + mov @acc[4], 8*4($r_ptr) + mov @acc[6], @acc[0] + adc 8*9($b_org), @acc[9] + mov @acc[3], 8*3($r_ptr) + mov @acc[7], @acc[1] + adc 8*10($b_org), @acc[10] + mov @acc[5], 8*5($r_ptr) + mov @acc[8], @acc[2] + adc 8*11($b_org), @acc[11] + mov @acc[9], @acc[3] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[10], @acc[4] + sbb 8*2($n_ptr), @acc[8] + sbb 8*3($n_ptr), @acc[9] + sbb 8*4($n_ptr), @acc[10] + mov @acc[11], @acc[5] + sbb 8*5($n_ptr), @acc[11] + sbb \$0, $b_org + + cmovc @acc[0], @acc[6] + cmovc @acc[1], @acc[7] + cmovc @acc[2], @acc[8] + mov @acc[6], 8*6($r_ptr) + cmovc @acc[3], @acc[9] + mov @acc[7], 8*7($r_ptr) + cmovc @acc[4], @acc[10] + mov @acc[8], 8*8($r_ptr) + cmovc @acc[5], @acc[11] + mov @acc[9], 8*9($r_ptr) + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,\@function,4,"unwind" +.align 32 +add_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 
+.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,\@function,4,"unwind" +.align 32 +sub_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/arm-xlate.pl b/crypto/blst_src/asm/arm-xlate.pl new file mode 100755 index 00000000000..35aab37407b --- /dev/null +++ b/crypto/blst_src/asm/arm-xlate.pl @@ -0,0 +1,386 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ARM assembler distiller/adapter by \@dot-asm. + +use strict; + +################################################################ +# Recognized "flavour"-s are: +# +# linux[32|64] GNU assembler, effectively pass-through +# ios[32|64] global symbols' decorations, PIC tweaks, etc. +# win[32|64] Visual Studio armasm-specific directives +# coff[32|64] e.g. clang --target=arm-windows ... +# +my $flavour = shift; + $flavour = "linux" if (!$flavour or $flavour eq "void"); + +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my $dotinlocallabels = ($flavour !~ /ios/) ? 
1 : 0; +my $in_proc; # used with 'windows' flavour + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch +my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu + +my $rodata = sub { + SWITCH: for ($flavour) { + /linux/ && return ".section\t.rodata"; + /ios/ && return ".section\t__TEXT,__const"; + /coff/ && return ".section\t.rdata,\"dr\""; + /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; + last; + } +}; + +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0\n"; + $ret .= ".previous"; + $name = "_$name"; + } elsif ($flavour =~ /win/) { + $ret = "\tCOMMON\t|$name|,@args[1]"; + } elsif ($flavour =~ /coff/) { + $ret = ".comm\t$name,@args[1]"; + } else { + $ret = ".comm\t".join(',',@args); + } + + $$global = $name; + $ret; +}; + +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; last; }; + /win/ && do { $ret = ""; last; }; + } + + $ret = ".globl $name" if (!defined($ret)); + $$global = $name; + $ret; +}; +my $global = $globl; + +my $extern = sub { + &$globl(@_); + if ($flavour =~ /win/) { + return "\tEXTERN\t@_"; + } + return; # return nothing +}; + +my $type = sub { + my $arg = join(',',@_); + my $ret; + + SWITCH: for ($flavour) { + /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = "#ifdef __thumb2__\n" . + ".thumb_func $1\n" . + "#endif"; + } + last; + }; + /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { + my $type = "[DATA]"; + if ($2 eq "function") { + $in_proc = $1; + $type = "[FUNC]"; + } + $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" + : ""; + } + last; + }; + /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = ".def $1;\n". + ".type 32;\n". + ".endef"; + } + last; + }; + } + return $ret; +} if ($flavour !~ /linux/); + +my $size = sub { + if ($in_proc && $flavour =~ /win/) { + $in_proc = undef; + return "\tENDP"; + } +} if ($flavour !~ /linux/); + +my $inst = sub { + if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } + else { ".long\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { if ($flavour =~ /win/) { + "\tDCB\t$line,0\n\tALIGN\t4"; + } else { + ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; + } + } else { ""; } +}; + +my $align = sub { + "\tALIGN\t".2**@_[0]; +} if ($flavour =~ /win/); + $align = sub { + ".p2align\t".@_[0]; +} if ($flavour =~ /coff/); + +my $byte = sub { + "\tDCB\t".join(',',@_); +} if ($flavour =~ /win/); + +my $short = sub { + "\tDCWU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $word = sub { + "\tDCDU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $long = $word if ($flavour =~ /win/); + +my $quad = sub { + "\tDCQU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $skip = sub { + "\tSPACE\t".shift; +} if ($flavour =~ /win/); + +my $code = sub { + "\tCODE@_[0]"; +} if ($flavour =~ /win/); + +my $thumb = sub { # .thumb should appear prior .text in source + "# define ARM THUMB\n" . + "\tTHUMB"; +} if ($flavour =~ /win/); + +my $text = sub { + "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); +} if ($flavour =~ /win/); + +my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax + +my $rva = sub { + # .rva directive comes in handy only on 32-bit Windows, i.e. it can + # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. + # However! Corresponding compilers don't seem to bet on PIC, which + # raises the question why would assembler programmer have to jump + # through the hoops? But just in case, it would go as following: + # + # ldr r1,.LOPENSSL_armcap + # ldr r2,.LOPENSSL_armcap+4 + # adr r0,.LOPENSSL_armcap + # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas + # sub r0,r0,r1 ; r0 is image base now + # ldr r0,[r0,r2] + # ... + #.LOPENSSL_armcap: + # .rva .LOPENSSL_armcap ; self-reference + # .rva OPENSSL_armcap_P ; real target + # + # Non-position-independent [and ISA-neutral] alternative is so much + # simpler: + # + # ldr r0,.LOPENSSL_armcap + # ldr r0,[r0] + # ... + #.LOPENSSL_armcap: + # .long OPENSSL_armcap_P + # + "\tDCDU\t@_[0]\n\tRELOC\t2" +} if ($flavour =~ /win(?!64)/); + +################################################################ +# some broken instructions in Visual Studio armasm[64]... + +my $it = sub {} if ($flavour =~ /win32/); # omit 'it' + +my $ext = sub { + "\text8\t".join(',',@_); +} if ($flavour =~ /win64/); + +my $csel = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsel$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +my $csetm = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsetm$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +# ... then conditional branch instructions are also broken, but +# maintaining all the variants is tedious, so I kludge-fix it +# elsewhere... +################################################################ +my $adrp = sub { + my ($args,$comment) = split(m|\s*//|,shift); + "\tadrp\t$args\@PAGE"; +} if ($flavour =~ /ios64/); + +my $paciasp = sub { + ($flavour =~ /linux/) ? "\t.inst\t0xd503233f" + : &$inst(0xd503233f); +}; + +my $autiasp = sub { + ($flavour =~ /linux/) ? 
"\t.inst\t0xd50323bf" + : &$inst(0xd50323bf); +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + if ($flavour =~ /win/) { + # adjust alignment hints, "[rN,:32]" -> "[rN@32]" + $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; + # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" + $line =~ s/\.(L\w{2,})/|\$$1|/g; + # omit "#:lo12:" on win64 + $line =~ s/#:lo12://; + } elsif ($flavour =~ /coff(?!64)/) { + $line =~ s/\.L(\w{2,})/(\$ML$1)/g; + } elsif ($flavour =~ /ios64/) { + $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; + } + + if ($flavour =~ /64/) { + # "vX.Md[N]" -> "vX.d[N] + $line =~ s/\b(v[0-9]+)\.[1-9]+([bhsd]\[[0-9]+\])/$1.$2/; + } + + return $line; +} + +while(my $line=<>) { + + # fix up assembler-specific commentary delimiter + $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); + + if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + $label = ($GLOBALS{$label} or $label); + if ($flavour =~ /win/) { + $label =~ s|^\.L(?=\w)|\$L|; + printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : ""); + } else { + $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); + printf "%s:", $label; + } + } + } + + if ($line !~ m/^[#@;]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + if ($flavour =~ /win64/) { + # "b.cond" -> "bcond", kludge-fix:-( + $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; + } + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +print "\tEND\n" if ($flavour =~ /win/); + +close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl new file mode 100755 index 00000000000..ced8c6c37e9 --- /dev/null +++ b/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl @@ -0,0 +1,586 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - +# on Cortex-A57. 
+# +# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, +# const vec256 modx); +# +$python_ref.=<<'___'; +def ct_inverse_mod_256(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 512 // k - 1): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_256_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_512x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 512 % k + k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 512 % k + k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + mod <<= 512 - mod.bit_length() # align to the left + if v < 0: + v += mod + if v < 0: + v += mod + elif v == 1<<512 + v -= mod + + return v & (2**512 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(4..11)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17)); +my $cnt = $n_ptr; +my @t = map("x$_",(19..26)); +my ($a_lo, $b_lo) = @acc[3,7]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + paciasp + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #$frame + + ldp @acc[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+ str $out_ptr, [sp] + + ldp @acc[4], @acc[5], [$n_ptr,#8*0] + ldp @acc[6], @acc[7], [$n_ptr,#8*2] + + stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b| + stp @acc[6], @acc[7], [$in_ptr,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str $f0,[$out_ptr,#8*8] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str $f0, [$out_ptr,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr @acc[4], [$in_ptr,#8*8] // |u| + ldr @acc[5], [$in_ptr,#8*13] // |v| + madd @acc[0], $f_, @acc[4], xzr // |u|*|f0| + madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0| + str @acc[0], [$out_ptr,#8*4] + asr @acc[1], @acc[0], #63 // sign extenstion + stp @acc[1], @acc[1], [$out_ptr,#8*5] + stp @acc[1], @acc[1], [$out_ptr,#8*7] + + madd @acc[0], $f0, @acc[4], xzr // |u|*|f1| + madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1| + str @acc[0], [$out_ptr,#8*9] + asr @acc[1], @acc[0], #63 // sign extenstion + stp @acc[1], @acc[1], [$out_ptr,#8*10] + stp @acc[1], @acc[1], [$out_ptr,#8*12] +___ +for($i=2; $i<15; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add $out_ptr, $out_ptr, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc @t[3], @t[3], @t[4] + str @t[3], [$out_ptr,#8*4] + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + add $out_ptr, $out_ptr, #8*5 // pointer to destination |v| + bl __smul_256x63 +___ +$code.=<<___ if ($i>7); + bl __smul_512x63_tail +___ +$code.=<<___ if ($i<=7); + adc @t[3], @t[3], @t[4] + stp @t[3], @t[3], [$out_ptr,#8*4] + stp @t[3], @t[3], [$out_ptr,#8*6] +___ +} +$code.=<<___; + ////////////////////////////////////////// two[!] 
last iterations + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + ldr $b_lo, [$in_ptr,#8*4] + bl __inner_loop_62_256 + + mov $f_, $f1 + mov $g_, $g1 + ldr $out_ptr, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh @t[1], @acc[3], $g_ // figure out top-most limb + ldp @acc[4], @acc[5], [$nx_ptr,#8*0] + adc @t[4], @t[4], @t[6] + ldp @acc[6], @acc[7], [$nx_ptr,#8*2] + + add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 + asr @t[0], @t[1], #63 // sign as mask + + and @t[4], @acc[4], @t[0] // add mod<<256 conditionally + and @t[5], @acc[5], @t[0] + adds @acc[0], @acc[0], @t[4] + and @t[6], @acc[6], @t[0] + adcs @acc[1], @acc[1], @t[5] + and @t[7], @acc[7], @t[0] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @t[3], @t[7] + adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 + + neg @t[0], @t[1] + orr @t[1], @t[1], @t[0] // excess bit or sign as mask + asr @t[0], @t[0], #63 // excess bit as mask + + and @acc[4], @acc[4], @t[1] // mask |mod| + and @acc[5], @acc[5], @t[1] + and @acc[6], @acc[6], @t[1] + and @acc[7], @acc[7], @t[1] + + eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| + eor @acc[5], @acc[5], @t[0] + adds @acc[4], @acc[4], @t[0], lsr#63 + eor @acc[6], @acc[6], @t[0] + adcs @acc[5], @acc[5], xzr + eor @acc[7], @acc[7], @t[0] + adcs @acc[6], @acc[6], xzr + adc @acc[7], @acc[7], xzr + + adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adc @acc[3], @acc[3], @acc[7] + stp @acc[2], @acc[3], [$out_ptr,#8*6] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + autiasp + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*8+8*5*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldr @t[3+$j], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @t[3+$j], @t[3+$j], $f1 + umulh @t[0], @acc[0], $f_ + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $f_ + adcs @t[3+$j], @t[3+$j], xzr + umulh @t[2], @acc[2], $f_ +___ +$code.=<<___ if ($j!=0); + adc $g1, xzr, xzr // used in __smul_512x63_tail +___ +$code.=<<___; + mul @acc[0], @acc[0], $f_ + cmp $f_, #0 + mul @acc[1], @acc[1], $f_ + csel @t[3+$j], @t[3+$j], xzr, ne + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @t[5+$j], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[5+$j], @t[5+$j], @t[2] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @t[5], @t[5], @t[6] + 
stp @acc[2], @t[5], [$out_ptr,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh @t[5], @acc[3], $f_ + ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v| + adc @t[7], @t[7], xzr + ldr @acc[3], [$in_ptr,#8*20] + and @t[3], @t[3], $f_ + + umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain + + sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain + asr @t[6], @t[5], #63 + + eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| + eor @acc[2], @acc[2], $f1 + adds @acc[1], @acc[1], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + umulh @t[0], @t[4], $g_ + adc @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $g_ + add @acc[7], @acc[7], @t[7] + umulh @t[2], @acc[2], $g_ + + mul @acc[0], @t[4], $g_ + mul @acc[1], @acc[1], $g_ + adds @acc[0], @acc[0], @acc[7] + mul @acc[2], @acc[2], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @t[3], @acc[3], $g_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[3], @t[3], @t[2] + adc @t[4], xzr, xzr // used in the final step + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adcs @t[3], @t[3], @t[6] // carry is used in the final step + stp @acc[2], @t[3], [$out_ptr,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*4*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) + + eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) + sub @t[6], @t[6], @t[5] + eor @acc[1], @acc[1], @t[5] + adds @acc[0], @acc[0], @t[5], lsr#63 + eor @acc[2], @acc[2], @t[5] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[5] + umulh @t[0], @acc[0], @t[6] + adcs @acc[2], @acc[2], xzr + umulh @t[1], @acc[1], @t[6] + adc @acc[3], @acc[3], xzr + umulh @t[2], @acc[2], @t[6] + and @t[5], @t[5], @t[6] + umulh @t[3+$j], @acc[3], @t[6] + neg @t[5], @t[5] + + mul @acc[0], @acc[0], @t[6] + mul @acc[1], @acc[1], @t[6] + mul @acc[2], @acc[2], @t[6] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], @t[1] + adcs @acc[3], @acc[3], @t[2] + adc @t[3+$j], @t[3+$j], @t[5] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + adcs @acc[3], @acc[3], @acc[7] + adc @acc[4], @t[3], @t[4] + + extr @acc[0], @acc[1], @acc[0], #31 + extr @acc[1], @acc[2], @acc[1], #31 + extr @acc[2], @acc[3], @acc[2], #31 + asr @t[4], @acc[4], #63 // result's sign as mask + extr @acc[3], @acc[4], @acc[3], #31 + + eor @acc[0], @acc[0], @t[4] // ensure the result is positive + eor @acc[1], @acc[1], @t[4] + adds @acc[0], @acc[0], @t[4], lsr#63 + eor @acc[2], @acc[2], @t[4] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[4] + adcs @acc[2], @acc[2], xzr + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adc @acc[3], @acc[3], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + + eor $f0, $f0, @t[4] // adjust |f/g| accordingly + eor $g0, $g0, @t[4] + sub $f0, $f0, @t[4] + sub $g0, $g0, @t[4] + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +___ + +{ +my @a = @acc[0..3]; +my @b = @acc[4..7]; +my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); + 
+$code.=<<___; +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*6] + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*4] + +.Lab_approximation_31_256_loaded: + orr @t[0], @a[3], @b[3] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[1], ne + orr @t[0], @a[3], @b[3] // and ones before top-most, ... + csel @b[2], @b[2], @b[1], ne + + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[0], ne + orr @t[0], @a[3], @b[3] // and one more, ... + csel @b[2], @b[2], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + neg @t[1], @t[0] + + lslv @a[3], @a[3], @t[0] // align high limbs to the left + lslv @b[3], @b[3], @t[0] + lsrv @a[2], @a[2], @t[1] + lsrv @b[2], @b[2], @t[1] + and @a[2], @a[2], @t[1], asr#6 + and @b[2], @b[2], @t[1], asr#6 + orr $a_lo, @a[3], @a[2] + orr $b_lo, @b[3], @b[2] + + bfxil $a_lo, @a[0], #0, #31 + bfxil $b_lo, @b[0], #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov $cnt, #31 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $fg1 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + cbnz $cnt, .Loop_31_256 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov @t[1], $g0 + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $f1, @t[3] + and @t[1], $g1, @t[3] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz $cnt, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +foreach(split("\n",$code)) { + s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/; + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl b/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl new file mode 100755 index 00000000000..24ab5452930 --- /dev/null +++ b/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl @@ -0,0 +1,837 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake. +# +# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, +# const vec256 modx); +# +$python_ref.=<<'___'; +def ct_inverse_mod_256(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 512 // k - 1): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_256_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_512x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 512 % k + k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 512 % k + k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + mod <<= 512 - mod.bit_length() # align to the left + if v < 0: + v += mod + if v < 0: + v += mod + elif v == 1<<512 + v -= mod + + return v & (2**512 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15)); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edx"; + +$frame = 8*6+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256,\@function,4,"unwind" +.align 32 +ct_inverse_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + + mov 8*0($n_ptr), @acc[4] # load modulus + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + mov 8*3($n_ptr), @acc[7] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + + mov @acc[4], 8*4(%rax) # copy modulus to |b| + mov @acc[5], 8*5(%rax) + mov @acc[6], 8*6(%rax) + mov @acc[7], 8*7(%rax) + mov %rax, $in_ptr + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*0(%rsp) # corrected |f0| + #mov $g0, 8*1(%rsp) # corrected |g0| + mov $f0, 8*8($out_ptr) # initialize |u| with |f0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + mov $f0, 8*9($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*8($in_ptr), @acc[0] # |u| + mov 8*13($in_ptr), @acc[4] # |v| + mov @acc[0], @acc[1] + imulq 8*0(%rsp), @acc[0] # |u|*|f0| + mov @acc[4], @acc[5] + imulq 8*1(%rsp), @acc[4] # |v|*|g0| + add @acc[4], @acc[0] + mov @acc[0], 8*4($out_ptr) # destination |u| + sar \$63, @acc[0] # sign extension + mov @acc[0], 8*5($out_ptr) + mov @acc[0], 8*6($out_ptr) + mov @acc[0], 8*7($out_ptr) + mov @acc[0], 8*8($out_ptr) + lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + imulq $f0, @acc[1] # |u|*|f1| + imulq $g0, @acc[5] # |v|*|g1| + add @acc[5], @acc[1] + mov @acc[1], 8*9($out_ptr) # destination |v| + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + mov @acc[1], 8*12($out_ptr) + mov @acc[1], 8*13($out_ptr) +___ +for($i=2; $i<15; $i++) { +my $smul_512x63 = $i>8 ? 
"__smulq_512x63" + : "__smulq_256x63"; +$code.=<<___; + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + mov $f0, 8*2(%rsp) # corrected |f1| + mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*0(%rsp), $f0 # |f0| + mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*4($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_256x63 + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*5($out_ptr),$out_ptr # pointer to destination |v| + call $smul_512x63 +___ +$code.=<<___ if ($i==8); + sar \$63, %rbp # sign extension + mov %rbp, 8*5($out_ptr) + mov %rbp, 8*6($out_ptr) + mov %rbp, 8*7($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$47, $cnt # 31 + 512 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*4($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + #mov $f1, 8*2(%rsp) + #mov $g1, 8*3(%rsp) + + #mov 8*0(%rsp), $f0 # |f0| + #mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_256x63 + + #mov 8*2(%rsp), $f0 # |f1| + #mov 8*3(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original |out_ptr| + call __smulq_512x63 + adc %rbp, %rdx # the excess limb of the result + + mov 8*5(%rsp), $in_ptr # original |nx_ptr| + mov %rdx, %rax + sar \$63, %rdx # result's sign as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + add @acc[0], @acc[4] # conditionally add |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + adc \$0, %rax + + mov %rax, %rdx + neg %rax + or %rax, %rdx # excess bit or sign as mask + sar \$63, %rax # excess bit as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + xor %rax, @acc[0] # conditionally negate |modulus| + xor %rcx, %rcx + xor %rax, @acc[1] + sub %rax, %rcx + xor %rax, @acc[2] + xor %rax, %rdx + add %rcx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, %rdx + + add @acc[0], @acc[4] # final adjustment for |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + + mov @acc[4], 8*4($out_ptr) # store absolute value + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp 
+.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +$code.=<<___; +.type __smulq_512x63,\@abi-omnipotent +.align 32 +__smulq_512x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), %rbp # sign limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, %rbp + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, %rbp + + mulq %rbx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov @acc[$i], 8*$i($out_ptr) + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, %rbp + neg %rbp + mulq %rbx + add %rax, @acc[3] + adc %rdx, %rbp + mov @acc[3], 8*3($out_ptr) + + mov 8*5($in_ptr), @acc[0] # load |v| + mov 8*6($in_ptr), @acc[1] + mov 8*7($in_ptr), @acc[2] + mov 8*8($in_ptr), @acc[3] + mov 8*9($in_ptr), @acc[4] + mov 8*10($in_ptr), @acc[5] + mov 8*11($in_ptr), @acc[6] + mov 8*12($in_ptr), @acc[7] + + mov $g0, $f0 + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $g0 # conditionally negate |g0| + add %rax, $g0 + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + + mulq $g0 + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<7; $i++) { +$code.=<<___; + mulq $g0 + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + imulq $g0 + add %rax, @acc[7] + adc \$0, %rdx # used in the final step + + mov %rbp, %rbx + sar \$63, %rbp # sign extension + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc %rbx, @acc[4] + adc %rbp, @acc[5] + adc %rbp, @acc[6] + adc %rbp, @acc[7] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 
8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + ret +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,\@abi-omnipotent +.align 32 +__smulq_256x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*5*$j; +my @acc=@acc; @acc=@acc[4..7] if($j); +my $top="%rbp"; $top=$g0 if($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), $top # sign/excess limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| (or |v|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, $top + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, $top + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, $top + neg $top + mulq %rbx + add %rax, @acc[3] + adc %rdx, $top +___ +$code.=<<___ if ($j==0); + mov $g0, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] # accumulate |u|*|f0| + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc %rcx, %rbp + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov %rbp, 8*4($out_ptr) + + ret +.size __smulq_256x63,.-__smulq_256x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
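+#
+# A hedged Python model (illustrative only, mirroring the python_ref at the
+# top of this file; the function name is hypothetical) of the
+# (|a|*|f0|+|b|*|g0|)>>31 step performed by __smulq_256_n_shift_by_31,
+# including the conditional negation that keeps |a| and |b| non-negative:
+#
+#   def smul_n_shift_31(a, b, f, g):
+#       r = (a*f + b*g) >> 31              # arithmetic shift of signed sum
+#       if r < 0:                          # negate result and flip f, g,
+#           r, f, g = -r, -f, -g           # just as in the reference above
+#       return r, f, g
+#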
+{ +$code.=<<___; +.type __smulq_256_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulq_256_n_shift_by_31: + mov $f0, 8*0($out_ptr) # offload |f0| + mov $g0, 8*1($out_ptr) # offload |g0| + mov $f0, %rbp +___ +for($j=0; $j<2; $j++) { +my $k = 8*4*$j; +my @acc=@acc; @acc=@acc[4..7] if ($j); +my $f0="%rbp"; $f0=$g0 if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| (or |g0|) + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |a| (or |b|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + and %rbx, $f0 + neg $f0 + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + mulq %rbx + add %rax, @acc[3] + adc %rdx, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc $g0, %rbp + + mov 8*0($out_ptr), $f0 # restore original |f0| + mov 8*1($out_ptr), $g0 # restore original |g0| + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, %rbp, @acc[3] + + sar \$63, %rbp # sign as mask + xor %rax, %rax + sub %rbp, %rax # sign as bit + + xor %rbp, @acc[0] # conditionally negate the result + xor %rbp, @acc[1] + xor %rbp, @acc[2] + xor %rbp, @acc[3] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + + xor %rbp, $f0 # conditionally negate |f0| + xor %rbp, $g0 # conditionally negate |g0| + add %rax, $f0 + add %rax, $g0 + + ret +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +___ +} + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31_256,\@abi-omnipotent +.align 32 +__ab_approximation_31_256: + mov 8*3($in_ptr), @a[2] # load |a| in reverse order + mov 8*7($in_ptr), @b[2] # load |b| in reverse order + mov 8*2($in_ptr), @a[1] + mov 8*6($in_ptr), @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*5($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*4($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + not %rax + and %rax, @a[2] + and %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31_256 + + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +___ +} +$code.=<<___; +.type __inner_loop_31_256,\@abi-omnipotent +.align 32 # comment and punish Coffee Lake by up to 40% +__inner_loop_31_256: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31_256: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31_256 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,\@abi-omnipotent +.align 32 +__inner_loop_62_256: + mov $cnt, %r15d + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov $f0, $g1 # |g1|=1 + mov $f0, %r14 + +.Loop_62_256: + xor $t0, $t0 + test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test %r14, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, %r15d + jnz .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl new file mode 100755 index 00000000000..268bf9d2546 --- /dev/null +++ b/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl @@ -0,0 +1,610 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific FLT addition chain... 
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(3..14)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); +my $cnt = $n_ptr; +my @t = map("x$_",(22..28,2)); +my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @t[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + ldp @acc[4], @acc[5], [$in_ptr,#8*4] + + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+ stp $out_ptr, $nx_ptr, [sp] + + ldp @acc[6], @acc[7], [$n_ptr,#8*0] + ldp @acc[8], @acc[9], [$n_ptr,#8*2] + ldp @acc[10], @acc[11], [$n_ptr,#8*4] + + stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*2] + stp @acc[4], @acc[5], [$in_ptr,#8*4] + stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*8] + stp @acc[10], @acc[11], [$in_ptr,#8*10] + + ////////////////////////////////////////// first iteration + mov $cnt, #62 + bl .Lab_approximation_62_loaded + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str $f0,[$out_ptr,#8*12] // initialize |u| with |f0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str $f0, [$out_ptr,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr @acc[4], [$in_ptr,#8*12] // |u| + ldr @acc[5], [$in_ptr,#8*18] // |v| + mul @acc[0], $f_, @acc[4] // |u|*|f0| + smulh @acc[1], $f_, @acc[4] + mul @acc[2], $g_, @acc[5] // |v|*|g0| + smulh @acc[3], $g_, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + asr @acc[2], @acc[1], #63 // sign extenstion + stp @acc[2], @acc[2], [$out_ptr,#8*8] + stp @acc[2], @acc[2], [$out_ptr,#8*10] + + mul @acc[0], $f0, @acc[4] // |u|*|f1| + smulh @acc[1], $f0, @acc[4] + mul @acc[2], $g0, @acc[5] // |v|*|g1| + smulh @acc[3], $g0, @acc[5] + adds @acc[0], @acc[0], @acc[2] + adc @acc[1], @acc[1], @acc[3] + stp @acc[0], @acc[1], [$out_ptr,#8*12] + asr @acc[2], @acc[1], #63 // sign extenstion + stp @acc[2], @acc[2], [$out_ptr,#8*14] + stp @acc[2], @acc[2], [$out_ptr,#8*16] +___ +for($i=2; $i<11; $i++) { +$code.=<<___; + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + bl __ab_approximation_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov $f_, $f0 // corrected |f0| + mov $g_, $g0 // corrected |g0| + + mov $f0, $f1 // |f1| + mov $g0, $g1 // |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add $out_ptr, $out_ptr, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov $f_, $f0 // corrected |f1| + mov $g_, $g0 // corrected |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to destination |v| + bl __smul_383x63 +___ +$code.=<<___ if ($i>5); + bl __smul_767x63_tail +___ +$code.=<<___ if ($i==5); + asr @t[5], @t[5], #63 // sign extension + stp @t[5], @t[5], [$out_ptr,#8*6] + stp @t[5], @t[5], [$out_ptr,#8*8] + stp @t[5], @t[5], [$out_ptr,#8*10] +___ +} +$code.=<<___; + ////////////////////////////////////////// iteration before last + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load + ldp $b_lo, $b_hi, [$in_ptr,#8*6] + bl __inner_loop_62 + + eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| + str $a_lo, [$out_ptr,#8*0] + str $b_lo, [$out_ptr,#8*6] + + mov $f_, $f0 // exact |f0| + mov $g_, $g0 // 
exact |g0| + mov $f0, $f1 + mov $g0, $g1 + add $out_ptr, $out_ptr, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov $f_, $f0 // exact |f1| + mov $g_, $g0 // exact |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + eor $a_hi, $a_hi, $a_hi + ldr $b_lo, [$in_ptr,#8*6] + eor $b_hi, $b_hi, $b_hi + bl __inner_loop_62 + + mov $f_, $f1 + mov $g_, $g1 + ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr @t[0], @acc[5], #63 // sign as mask + ldp @acc[6], @acc[7], [$f0,#8*0] + ldp @acc[8], @acc[9], [$f0,#8*2] + ldp @acc[10], @acc[11], [$f0,#8*4] + + and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally + and @acc[7], @acc[7], @t[0] + adds @acc[0], @acc[0], @acc[6] + and @acc[8], @acc[8], @t[0] + adcs @acc[1], @acc[1], @acc[7] + and @acc[9], @acc[9], @t[0] + adcs @acc[2], @acc[2], @acc[8] + and @acc[10], @acc[10], @t[0] + adcs @acc[3], @acc[3], @acc[9] + and @acc[11], @acc[11], @t[0] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @acc[11] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.type __smul_383x63, %function +.align 5 +__smul_383x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*12+8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $f_ + eor @acc[5], @acc[5], $f1 + umulh @t[1], @acc[1], $f_ + adcs @acc[4], @acc[4], xzr + umulh @t[2], @acc[2], $f_ + adcs @acc[5], @acc[5], xzr + umulh @t[3], @acc[3], $f_ +___ +$code.=<<___ if ($j); + adc $g1, xzr, xzr // used in __smul_767x63_tail +___ +$code.=<<___; + umulh @t[4], @acc[4], $f_ + mul @acc[0], @acc[0], $f_ + mul @acc[1], @acc[1], $f_ + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $f_ + adcs @acc[3], @acc[3], @t[2] + mul @t[5+$j],@acc[5], $f_ + adcs @acc[4], @acc[4], @t[3] + adcs @t[5+$j],@t[5+$j],@t[4] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adcs @t[5], @t[5], @t[6] + stp @acc[4], @t[5], [$out_ptr,#8*4] + adc @t[6], @t[7], xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh @t[5], @acc[5], $f_ + ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v| + umulh @acc[11],@acc[11], $g_ + ldp @acc[2], @acc[3], [$in_ptr,#8*26] + ldp @acc[4], @acc[5], [$in_ptr,#8*28] + + eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v| + eor @acc[1], @acc[1], $f1 + eor @acc[2], @acc[2], $f1 + adds @acc[0], @acc[0], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[5], @acc[5], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $g_ + adcs @acc[4], @acc[4], xzr + umulh @t[1], @acc[1], $g_ + adc @acc[5], @acc[5], xzr + + umulh @t[2], @acc[2], $g_ + add @acc[11], @acc[11], @t[6] + umulh @t[3], @acc[3], $g_ + asr @t[6], @t[5], #63 + umulh @t[4], @acc[4], $g_ + mul @acc[0], @acc[0], $g_ + mul @acc[1], @acc[1], $g_ + mul @acc[2], @acc[2], $g_ + adds @acc[0], @acc[0], @acc[11] + mul @acc[3], @acc[3], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @acc[4], @acc[4], $g_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[5], @acc[5], $g_ + adcs @acc[3], @acc[3], @t[2] + adcs @acc[4], @acc[4], @t[3] + adc @acc[5], @acc[5], @t[4] + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @acc[3], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @t[6] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @t[6] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, 
%function +.align 5 +__smul_383_n_shift_by_62: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) + sub @t[7], @t[7], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], @t[7] + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], @t[7] + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], @t[7] + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], @t[7] + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], @t[7] + smulh @t[5+$j], @acc[5], @t[7] + mul @acc[0], @acc[0], @t[7] + mul @acc[1], @acc[1], @t[7] + mul @acc[2], @acc[2], @t[7] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[7] + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], @t[7] + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], @t[7] + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], xzr +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #62 + extr @acc[1], @acc[2], @acc[1], #62 + extr @acc[2], @acc[3], @acc[2], #62 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #62 + extr @acc[4], @acc[5], @acc[4], #62 + extr @acc[5], @acc[6], @acc[5], #62 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + eor $f0, $f0, @t[6] + eor $g0, $g0, @t[6] + sub $f0, $f0, @t[6] + sub $g0, $g0, @t[6] + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; + +$code.=<<___; +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp @a[4], @a[5], [$in_ptr,#8*4] + ldp @b[4], @b[5], [$in_ptr,#8*10] + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*8] + +.Lab_approximation_62_loaded: + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*6] + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... 
+ csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[1], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr @a[5], @a[5], @a[4] + orr @b[5], @b[5], @b[4] + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62: + sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + subs @t[2], $b_lo, $a_lo // |b_|-|a_| + and @t[0], $b_lo, @t[6] + sbc @t[3], $b_hi, $a_hi + and @t[1], $b_hi, @t[6] + subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + sbcs @t[5], $a_hi, @t[1] + mov @t[1], $g0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $b_hi, $b_hi, $a_hi, hs + csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $a_hi, @t[5], @t[3], hs + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + extr $a_lo, $a_hi, $a_lo, #1 + lsr $a_hi, $a_hi, #1 + and @t[0], $f1, @t[6] + and @t[1], $g1, @t[6] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) + cbnz $cnt, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl b/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl new file mode 100755 index 00000000000..4128dc3236d --- /dev/null +++ b/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl @@ -0,0 +1,401 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific Legendre symbol addition chain... 
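+#
+# In the $python_ref model below |L| matters only in its least significant
+# bit, which tracks the sign of the symbol; the boolean result is computed
+# as (L & 1) ^ 1.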
+# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); +my @acc=map("x$_",(3..14)); +my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); +my @t = map("x$_",(21..28)); +my ($a_, $b_) = @acc[5,11]; + +$frame = 2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @acc[0], @acc[1], [x0,#8*0] // load input + ldp @acc[2], @acc[3], [x0,#8*2] + ldp @acc[4], @acc[5], [x0,#8*4] + + add $in_ptr, sp, #255 // find closest 256-byte-aligned spot + and $in_ptr, $in_ptr, #-256 // in the frame... 
+ + ldp @acc[6], @acc[7], [x1,#8*0] // load modulus + ldp @acc[8], @acc[9], [x1,#8*2] + ldp @acc[10], @acc[11], [x1,#8*4] + + stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*8] + stp @acc[4], @acc[5], [$in_ptr,#8*10] + stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*2] + stp @acc[10], @acc[11], [$in_ptr,#8*4] + + eor $L, $L, $L // init the Legendre symbol + mov $cnt, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub $cnt, $cnt, #1 + + eor $out_ptr, $in_ptr, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov $f1, $f0 // |f0| + mov $g1, $g0 // |g0| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp @acc[6], @acc[7], [$out_ptr,#-8*6] + eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| + and @t[6], @t[6], @acc[6] // if |a| was negative, + add $L, $L, @t[6], lsr#1 // adjust |L| + + cbnz $cnt, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr $a_, [$in_ptr,#8*6] // and loaded + //ldr $b_, [$in_ptr,#8*0] + mov $cnt, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, $L, #1 + eor x0, x0, #1 + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +my $fx = $g1; $fx = $f1 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) + asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) + sub $fx, $fx, @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], $fx + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $fx + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], $fx + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], $fx + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], $fx + and @t[7], $fx, @t[6] + umulh @t[5+$j], @acc[5], $fx + neg @t[7], @t[7] + mul @acc[0], @acc[0], $fx + mul @acc[1], @acc[1], $fx + mul @acc[2], @acc[2], $fx + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $fx + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $fx + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], $fx + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], @t[7] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #30 + extr @acc[1], @acc[2], @acc[1], #30 + extr @acc[2], @acc[3], @acc[2], #30 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #30 + extr @acc[4], @acc[5], @acc[4], #30 + extr @acc[5], @acc[6], @acc[5], #30 + + 
eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; +my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); + +$code.=<<___; +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers + ldp @b[2], @b[3], [$in_ptr,#8*2] + + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... + csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] // and one more, ... + csel @b[4], @b[4], @b[1], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[0], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr $a_, @a[5], @a[4] + orr $b_, @b[5], @b[4] + + bfxil $a_, @a[0], #0, #32 + bfxil $b_, @b[0], #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov $cnt, #30 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 + mov @t[0], $fg1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + csel $L, $L, @t[4], hs + lsr $a_, $a_, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + add $t[2], $b_, #2 + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + + cbnz $cnt, .Loop_30 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove the bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size 
__inner_loop_30,.-__inner_loop_30 +___ +} + +{ +my ($a_, $b_) = (@acc[0], @acc[6]); +$code.=<<___; +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $L, $L, @t[4], hs + add $t[2], $b_, #2 + lsr $a_, $a_, #1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz $cnt, .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl b/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl new file mode 100755 index 00000000000..40016ed70d2 --- /dev/null +++ b/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl @@ -0,0 +1,494 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific Legendre symbol addition chain... +# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); +my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); +my @acc=map("%r$_",(8..15)); +my $L = "%rbp"; + +$frame = 8*3+2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,\@function,2,"unwind" +.align 32 
+ct_is_square_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot + and \$-256, %rax # in the frame... + + mov 8*0(%rdi), @acc[0] # load input + mov 8*1(%rdi), @acc[1] + mov 8*2(%rdi), @acc[2] + mov 8*3(%rdi), @acc[3] + mov 8*4(%rdi), @acc[4] + mov 8*5(%rdi), @acc[5] + + mov 8*0(%rsi), @acc[6] # load modulus + mov 8*1(%rsi), @acc[7] + mov 8*2(%rsi), %rbx + mov 8*3(%rsi), %rcx + mov 8*4(%rsi), %rdx + mov 8*5(%rsi), %rdi + mov %rax, $in_ptr # pointer to source |a|b| + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov %rbx, 8*8(%rax) + mov %rcx, 8*9(%rax) + mov %rdx, 8*10(%rax) + mov %rdi, 8*11(%rax) + + xor $L, $L # initialize the Legendre symbol + mov \$24, %ecx # 24 is 768/30-1 + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + mov %ecx, 8*2(%rsp) # offload loop counter + + call __ab_approximation_30 + mov $f0, 8*0(%rsp) # offload |f0| and |g0| + mov $g0, 8*1(%rsp) + + mov \$128+8*6, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |b| + call __smulq_384_n_shift_by_30 + + mov 8*0(%rsp), $f1 # pop |f0| and |g0| + mov 8*1(%rsp), $g1 + lea -8*6($out_ptr),$out_ptr # pointer to destination |a| + call __smulq_384_n_shift_by_30 + + mov 8*2(%rsp), %ecx # re-load loop counter + xor \$128, $in_ptr # flip-flop pointer to source |a|b| + + and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| + shr \$1, @acc[6] + add @acc[6], $L + + sub \$1, %ecx + jnz .Loop_is_square + + ################################# last iteration + #call __ab_approximation_30 # |a| and |b| are exact, just load + #mov 8*0($in_ptr), @acc[0] # |a_| + mov 8*6($in_ptr), @acc[1] # |b_| + call __inner_loop_48 # 48 is 768%30+30 + + mov \$1, %rax + and $L, %rax + xor \$1, %rax # return value + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,\@abi-omnipotent +.align 32 +__smulq_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, %rbx # |f1| (or |g1|) + sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) + xor %rax, %rax + sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) + + xor %rdx, %rbx # conditionally negate |f1| (or |g1|) + add %rax, %rbx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov %rdx, @acc[6+$j] + and %rbx, @acc[6+$j] + mulq %rbx # |a|*|f1| (or |b|*|g1|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, 
@acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + neg @acc[6+$j] + mulq %rbx + add %rax, @acc[5] + adc %rdx, @acc[6+$j] +___ +$code.=<<___ if ($j==0); + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov $g1, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc @acc[7], @acc[6] + + shrd \$30, @acc[1], @acc[0] + shrd \$30, @acc[2], @acc[1] + shrd \$30, @acc[3], @acc[2] + shrd \$30, @acc[4], @acc[3] + shrd \$30, @acc[5], @acc[4] + shrd \$30, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor %rbx, %rbx + sub @acc[6], %rbx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add %rbx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +___ +{ +my ($a_, $b_) = @acc[0..1]; +my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); +my $cnt = "%edi"; +{ +my @a = @acc[0..5]; +my @b = (@a[1..3], $t4, $t5, $g0); + +$code.=<<___; +.type __ab_approximation_30,\@abi-omnipotent +.align 32 +__ab_approximation_30: + mov 8*11($in_ptr), @b[5] # load |b| in reverse order + mov 8*10($in_ptr), @b[4] + mov 8*9($in_ptr), @b[3] + + mov @a[5], %rax + or @b[5], %rax # check top-most limbs, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[3], @a[4] + mov 8*8($in_ptr), @b[2] + cmovz @b[3], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... ones before top-most, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[2], @a[4] + mov 8*7($in_ptr), @b[1] + cmovz @b[2], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[1], @a[4] + mov 8*6($in_ptr), @b[0] + cmovz @b[1], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... 
+ cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[0], @a[4] + cmovz @b[0], @b[4] + + mov @a[5], %rax + or @b[5], %rax + bsr %rax, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[5] + cmovz @b[0], @b[5] + cmovz %rax, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[4], @a[5] # align second limb to the left + shldq %cl, @b[4], @b[5] + + mov \$0xFFFFFFFF00000000, %rax + mov @a[0]d, ${a_}d + mov @b[0]d, ${b_}d + and %rax, @a[5] + and %rax, @b[5] + or @a[5], ${a_} + or @b[5], ${b_} + + jmp __inner_loop_30 + + ret +.size __ab_approximation_30,.-__ab_approximation_30 +___ +} +$code.=<<___; +.type __inner_loop_30,\@abi-omnipotent +.align 32 +__inner_loop_30: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF + mov \$30, $cnt + +.Loop_30: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax # pre-"negate" |L| + mov $fg0, $t2 + mov $fg1, $t3 + mov $L, $t4 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + cmovz $t4, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 + sub $bias, $fg1 + + sub \$1, $cnt + jnz .Loop_30 + + shr \$32, $bias + mov %ebx, %eax # $fg0 -> $f0 + shr \$32, $g0 + mov %ecx, %edx # $fg1 -> $f1 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,\@abi-omnipotent +.align 32 +__inner_loop_48: + mov \$48, $cnt # 48 is 768%30+30 + +.Loop_48: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax + mov $L, $t2 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 + + sub \$1, $cnt + jnz .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl new file mode 100755 index 00000000000..2be39d8ba8b --- /dev/null +++ b/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl @@ -0,0 +1,886 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific FLT addition chain... 
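+#
+# Same algorithm as the ARMv8 ct_inverse_mod_383 above: per the $python_ref
+# model below, the inner loop is run in batches of 62 iterations, 12 full
+# batches plus a 22-iteration tail (766 = 12*62 + 22).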
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,\@function,4,"unwind" +.align 32 +ct_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr # pointer to source |a|b|1|0| + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<11; $i++) { +my $smul_767x63 = $i>5 ? 
"__smulq_767x63" + : "__smulq_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==5); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# iteration before last + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + mov 8*1($in_ptr), @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + mov 8*7($in_ptr), @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + mov @acc[0], 8*0($out_ptr) + mov @acc[2], 8*6($out_ptr) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*12($out_ptr),$out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call __smulq_767x63 + + ################################# last iteration + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$22, $cnt # 766 % 62 + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulq_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc 
@acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +___ +######################################################################## +# see corresponding commentary in ctx_inverse_mod_384-x86_64... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulq_767x63,\@abi-omnipotent +.align 32 +__smulq_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor $f0, $fx # conditionally negate |f0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] + mov @acc[$i], 8*$i($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + mov @acc[5], 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + mov $f0, $fx # overrides in_ptr + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $fx # conditionally negate |g0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + xor $f0, @acc[8] + xor $f0, @acc[9] + xor $f0, @acc[10] + xor $f0, @acc[11] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulq $fx # |v|*|g0| + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<11; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} 
+$code.=<<___; + mov 8*1(%rsp), %rdx # out_ptr + imulq $fx, %rax + mov 8*2(%rsp), $in_ptr # restore original in_ptr + add @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulq_767x63,.-__smulq_767x63 +___ +} +$code.=<<___; +.type __smulq_383x63,\@abi-omnipotent +.align 32 +__smulq_383x63: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |u| (or |v|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| (or |v|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx, %rax + add %rax, @acc[$i] + + lea 8*6($in_ptr), $in_ptr # pointer to |v| + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx, %rax + add %rax, @acc[$i] + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_383x63,.-__smulq_383x63 +___ +{ +$code.=<<___; +.type __smulq_383_n_shift_by_62,\@abi-omnipotent +.align 32 +__smulq_383_n_shift_by_62: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| (or |g0|) + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add 
@acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |a|*|f0| (or |b|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov %rdx, @acc[6] + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$62, @acc[1], @acc[0] + shrd \$62, @acc[2], @acc[1] + shrd \$62, @acc[3], @acc[2] + shrd \$62, @acc[4], @acc[3] + shrd \$62, @acc[5], @acc[4] + shrd \$62, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_62,\@abi-omnipotent +.align 32 +__ab_approximation_62: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*2($in_ptr), @a[0] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*0($in_ptr), @a[0] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + jmp __inner_loop_62 + + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62,\@abi-omnipotent +.align 8 +.long 0 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + mov $in_ptr, 8(%rsp) + +.Loop_62: + xor $t0, $t0 + xor $t1, $t1 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t2 + mov $b_hi, $t3 + cmovnz $b_lo, $t0 + cmovnz $b_hi, $t1 + sub $a_lo, $t2 # |b_|-|a_| + sbb $a_hi, $t3 + mov $a_lo, $t4 + mov $a_hi, $t5 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + sbb $t1, $a_hi + cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t3, $a_hi + cmovc $t4, $b_lo # |b_| = |a_| + cmovc $t5, $b_hi + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shrd \$1, $a_hi, $a_lo + shr \$1, $a_hi + test \$1, $t4 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + mov 8(%rsp), $in_ptr + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl new file mode 100755 index 00000000000..d207e2f5a7c --- /dev/null +++ b/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl @@ -0,0 +1,995 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >4x better than +# modulus-specific FLT addition chain... 
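+#
+# Unlike the preceding ct_inverse_mod_383 implementations, this variant runs
+# the inner loop in batches of 31 iterations, 24 full batches plus a
+# 22-iteration tail (766 = 24*31 + 22), as reflected by k = 31 in the
+# $python_ref model below.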
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulx_383_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulx_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,\@function,4,"unwind" +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
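+	# (a worked example of the rounding above, with made-up numbers:
+	# 0x1001 + 511 = 0x1200, and 0x1200 & -512 = 0x1200, so %rax lands
+	# on the first 512-byte boundary at or above %rsp + 8*11, keeping
+	# the low 8*N(%rsp) scratch slots used later below it)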
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<23; $i++) { +my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31" + : "__smulx_191_n_shift_by_31"; +my $smul_767x63 = $i>11 ? 
"__smulx_767x63" + : "__smulx_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call $smul_n_shift + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call $smul_n_shift + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulx_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==11); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$53, $cnt # 31 + 766 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulx_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulx_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. 
"NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulx_767x63,\@abi-omnipotent +.align 32 +__smulx_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, %rax + sar \$63, %rax # |f0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor %rax, $f0 # conditionally negate |f0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |u| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |u|*|f0| + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + mov $g0, %rax + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + sar \$63, %rax # |g0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |g0|'s sign as bit + + xor %rax, $f0 # conditionally negate |g0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |v| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor %rax, @acc[5] + xor %rax, @acc[6] + xor %rax, @acc[7] + xor %rax, @acc[8] + xor %rax, @acc[9] + xor %rax, @acc[10] + xor %rax, @acc[11] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulx @acc[0], @acc[0], %rax # |v|*|g0| + mulx @acc[1], @acc[1], $fx + add %rax, @acc[1] +___ +for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) { +$code.=<<___; + mulx 
@acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + mulx @acc[11], @acc[11], $fx + mov 8*1(%rsp), %rdx # out_ptr + mov 8*2(%rsp), $in_ptr # restore original in_ptr + adc @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulx_767x63,.-__smulx_767x63 +___ +} +$code.=<<___; +.type __smulx_383x63,\@abi-omnipotent +.align 32 +__smulx_383x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $fx, $f0 # conditionally negate |f0| + add %rax, $f0 + + xor $fx, @acc[0] # conditionally negate |u| (or |v|) + xor $fx, @acc[1] + xor $fx, @acc[2] + xor $fx, @acc[3] + xor $fx, @acc[4] + xor $fx, @acc[5] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) + mulx @acc[1], @acc[1], %rax + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + mulx @acc[$i], @acc[$i], %rax + mov $g0, $f0 + adc $fx, @acc[$i] + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + mulx @acc[$i], @acc[$i], %rax + adc $fx, @acc[$i] + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulx_383x63,.-__smulx_383x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
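+#
+# A rough Python-style model of what these helpers return (the names are
+# illustrative and simply mirror the python_ref at the top of this file;
+# they are not part of the generated code):
+#
+#   def smul_n_shift(a, b, f0, g0, k):
+#       t = a*f0 + b*g0                  # signed
+#       t >>= k                          # low k bits are shifted out
+#       if t < 0:                        # keep the value non-negative and
+#           t, f0, g0 = -t, -f0, -g0     # fold the sign back into f0/g0
+#       return t, f0, g0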
+{ +$code.=<<___; +.type __smulx_383_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_383_n_shift_by_31: + mov $f0, @acc[8] + xor @acc[6], @acc[6] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc %rdx, @acc[6] + + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) +___ +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), %rax + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, @acc[4], @acc[3] + shrd \$31, %rax, @acc[4] + shrd \$31, @acc[6], %rax + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +___ +} { +$code.=<<___; +.type __smulx_191_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_191_n_shift_by_31: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +my @acc=@acc; + @acc=@acc[3..5] if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor @acc[2], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[2] + add $fx, @acc[1] + adc \$0, @acc[2] + imulq %rdx + add %rax, @acc[2] + adc 
\$0, %rdx +___ +$code.=<<___ if ($j==0); + mov %rdx, @acc[6] + mov $g0, %rdx +___ +} +$code.=<<___; + add @acc[0], @acc[3] + adc @acc[1], @acc[4] + adc @acc[2], @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[4], @acc[3] + shrd \$31, @acc[5], @acc[4] + shrd \$31, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[3] # conditionally negate the result + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[3], 8*0($out_ptr) + mov @acc[4], 8*1($out_ptr) + mov @acc[5], 8*2($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31,\@abi-omnipotent +.align 32 +__ab_approximation_31: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*2($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*1($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + andn @a[2], %rax, @a[2] + andn @b[2], %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31 + + ret +.size __ab_approximation_31,.-__ab_approximation_31 +___ +} +$code.=<<___; +.type __inner_loop_31,\@abi-omnipotent +.align 32 +__inner_loop_31: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,\@abi-omnipotent +.align 32 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + +.Loop_62: + xor $t0, $t0 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test \$1, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/div3w-armv8.pl b/crypto/blst_src/asm/div3w-armv8.pl new file mode 100755 index 00000000000..bfa32453c3a --- /dev/null +++ b/crypto/blst_src/asm/div3w-armv8.pl @@ -0,0 +1,122 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$code.=<<___; +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +{ +my ($div_rem, $divisor, $quot) = map("x$_",(0..2)); +my @div = map("x$_",(3..4)); +my @acc = map("x$_",(5..7)); +my @t = map("x$_",(8..11)); + +$code.=<<___; +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp @div[0],@div[1],[$divisor] + + mul @acc[0],@div[0],$quot // divisor[0:1} * quotient + umulh @acc[1],@div[0],$quot + mul @t[3], @div[1],$quot + umulh @acc[2],@div[1],$quot + + ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend + ldr @t[2],[$div_rem,#16] + + adds @acc[1],@acc[1],@t[3] + adc @acc[2],@acc[2],xzr + + subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient + sbcs @t[1],@t[1],@acc[1] + sbcs @t[2],@t[2],@acc[2] + sbc @acc[0],xzr,xzr // borrow -> mask + + add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ... + and @div[0],@div[0],@acc[0] + and @div[1],@div[1],@acc[0] + adds @t[0],@t[0],@div[0] // ... and add divisor + adc @t[1],@t[1],@div[1] + + stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder + str $quot,[$div_rem,#16] // and one limb of the quotient + + mov x0,$quot // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr @div[0],[$divisor] + ldr @t[0],[$div_rem] // load 1 limb of the dividend + + mul @acc[0],@div[0],$quot // divisor * quotient + + sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient + + stp @t[0],$quot,[$div_rem] // save remainder and quotient + + mov x0,$quot // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/div3w-x86_64.pl b/crypto/blst_src/asm/div3w-x86_64.pl new file mode 100755 index 00000000000..b8192db8e6d --- /dev/null +++ b/crypto/blst_src/asm/div3w-x86_64.pl @@ -0,0 +1,184 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$c_ref=<<'___'; +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) +{ + llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; + llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; + limb_t Q = 0, mask; + size_t i; + + for (i = 0; i < LIMB_BITS; i++) { + Q <<= 1; + mask = (R >= D); + Q |= mask; + R -= (D & ((llimb_t)0 - mask)); + D >>= 1; + } + + mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ + + Q <<= 1; + Q |= (R >= D); + + return (Q | mask); +} +___ + +$code.=<<___; +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,\@function,3 +.align 32 +div_3_limbs: + mov (%rdi),%r8 # load R.lo + mov 8(%rdi),%r9 # load R.hi + xor %rax,%rax # Q = 0 + mov \$64,%ecx # loop counter + +.Loop: + mov %r8,%r10 # put aside R + sub %rsi,%r8 # R -= D + mov %r9,%r11 + sbb %rdx,%r9 + lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit + mov %rdx,%rdi + cmovc %r10,%r8 # restore R if R - D borrowed + cmovc %r11,%r9 + sbb \$0,%rax # subtract speculative bit + shl \$63,%rdi + shr \$1,%rsi + shr \$1,%rdx + or %rdi,%rsi # D >>= 1 + sub \$1,%ecx + jnz .Loop + + lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit + sar \$63,%rax # top bit -> mask + + sub %rsi,%r8 # R -= D + sbb %rdx,%r9 + sbb \$0,%rcx # subtract speculative bit + + or %rcx,%rax # all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +######################################################################## +# Calculate remainder and adjust the quotient, which can be off-by-one. +# Then save quotient in limb next to top limb of the remainder. There is +# place, because the remainder/next-iteration-dividend gets shorter by +# one limb. 
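+#
+# A minimal sketch of that adjustment (illustrative Python, assuming the
+# estimate produced by div_3_limbs can overshoot by at most one):
+#
+#   def quot_rem(dividend, divisor, q):
+#       r = dividend - divisor*q
+#       if r < 0:              # estimate was one too large
+#           q -= 1
+#           r += divisor
+#       return q, r            # r overwrites the dividend limbs, q goes
+#                              # into the limb that just became free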
+{ +my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); +my @acc = ("%r8", "%r9", "%rdx"); +my @tmp = ("%r10", "%r11", "%rax"); + +$code.=<<___; +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,\@function,3 +.align 32 +quot_rem_128: + mov %rdx, %rax + mov %rdx, $quotient + + mulq 0($divisor) # divisor[0:1] * quotient + mov %rax, @acc[0] + mov $quotient, %rax + mov %rdx, @acc[1] + + mulq 8($divisor) + add %rax, @acc[1] + adc \$0, %rdx # %rdx is @acc[2] + + mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend + mov 8($div_rem), @tmp[1] + mov 16($div_rem), @tmp[2] + + sub @acc[0], @tmp[0] # dividend - divisor * quotient + sbb @acc[1], @tmp[1] + sbb @acc[2], @tmp[2] + sbb @acc[0], @acc[0] # borrow -> mask + + add @acc[0], $quotient # if borrowed, adjust the quotient ... + mov @acc[0], @acc[1] + and 0($divisor), @acc[0] + and 8($divisor), @acc[1] + add @acc[0], @tmp[0] # ... and add divisor + adc @acc[1], @tmp[1] + + mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... + mov @tmp[1], 8($div_rem) + mov $quotient, 16($div_rem) # ... and 1 limb of the quotient + + mov $quotient, %rax # return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +######################################################################## +# Unlike 128-bit case above, quotient is exact. As result just one limb +# of the dividend is sufficient to calculate the remainder... + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,\@function,3 +.align 32 +quot_rem_64: + mov %rdx, %rax # return quotient + imulq 0($divisor), %rdx # divisor[0] * quotient + + mov 0($div_rem), @tmp[0] # load 1 limb of the dividend + + sub %rdx, @tmp[0] # dividend - divisor * quotient + + mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... + mov %rax, 8($div_rem) # ... and 1 limb of the quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_256-armv8.pl b/crypto/blst_src/asm/mul_mont_256-armv8.pl new file mode 100755 index 00000000000..ba6c2b87980 --- /dev/null +++ b/crypto/blst_src/asm/mul_mont_256-armv8.pl @@ -0,0 +1,409 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod=map("x$_",(5..8)); +$bi="x9"; +@a=map("x$_",(10..13)); +@tmp=map("x$_",(14..17)); +@acc=map("x$_",(19..24)); +$m0=$n_ptr; + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + mul @acc[0],@a[0],$bi + ldp @mod[0],@mod[1],[$n_ptr] + mul @acc[1],@a[1],$bi + ldp @mod[2],@mod[3],[$n_ptr,#16] + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],xzr, @tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adc @acc[4],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adc @acc[4],@acc[4],xzr + + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +___ +{ +my @acc = (@a,@acc[0..3]); +my @a = @mod; + +$code.=<<___; +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mov $n0,$n_ptr + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul @acc[1],@a[1],@a[0] // a[1]*a[0] + umulh @tmp[1],@a[1],@a[0] + mul @acc[2],@a[2],@a[0] // a[2]*a[0] + umulh @tmp[2],@a[2],@a[0] + mul @acc[3],@a[3],@a[0] // a[3]*a[0] + umulh @acc[4],@a[3],@a[0] + + adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication + mul @tmp[0],@a[2],@a[1] // a[2]*a[1] + umulh @tmp[1],@a[2],@a[1] + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@a[3],@a[1] // a[3]*a[1] + umulh @tmp[3],@a[3],@a[1] + adc @acc[4],@acc[4],xzr // can't overflow + + mul @acc[5],@a[3],@a[2] // a[3]*a[2] + umulh @acc[6],@a[3],@a[2] + + adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication + mul @acc[0],@a[0],@a[0] // a[0]*a[0] + adc @tmp[2],@tmp[3],xzr // can't overflow + + adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication + umulh @a[0],@a[0],@a[0] + adcs @acc[4],@acc[4],@tmp[1] + mul @tmp[1],@a[1],@a[1] // a[1]*a[1] + adcs @acc[5],@acc[5],@tmp[2] + umulh @a[1],@a[1],@a[1] + adc @acc[6],@acc[6],xzr // can't overflow + + adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 + mul @tmp[2],@a[2],@a[2] // a[2]*a[2] + adcs @acc[2],@acc[2],@acc[2] + umulh @a[2],@a[2],@a[2] + adcs @acc[3],@acc[3],@acc[3] + mul @tmp[3],@a[3],@a[3] // a[3]*a[3] + adcs @acc[4],@acc[4],@acc[4] + umulh @a[3],@a[3],@a[3] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adc @acc[7],xzr,xzr + + adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] + adcs @acc[2],@acc[2],@tmp[1] + adcs @acc[3],@acc[3],@a[1] + adcs @acc[4],@acc[4],@tmp[2] + adcs @acc[5],@acc[5],@a[2] + adcs @acc[6],@acc[6],@tmp[3] + adc @acc[7],@acc[7],@a[3] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds @acc[0],@acc[0],@acc[4] // accumulate upper half + adcs @acc[1],@acc[1],@acc[5] + adcs @acc[2],@acc[2],@acc[6] + adcs @acc[3],@acc[3],@acc[7] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +} +{ +my @a = (@a, $bi); + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + paciasp + stp x29,x30,[sp,#-16]! 
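+	// from_mont_256 leaves Montgomery form with a single reduction
+	// pass, i.e. a Montgomery multiplication by 1 (the
+	// __mul_by_1_mont_256 call below) followed by one conditional
+	// subtraction of the modulus.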
+ add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp @tmp[0],@tmp[1],[$a_ptr,#32] + ldp @tmp[2],@tmp[3],[$a_ptr,#48] + + adds @a[0],@a[0],@tmp[0] + adcs @a[1],@a[1],@tmp[1] + adcs @a[2],@a[2],@tmp[2] + adcs @a[3],@a[3],@tmp[3] + adc @a[4],xzr,xzr + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + sbcs xzr, @a[4],xzr + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul $m0,$n0,@a[0] + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + mul $m0,$n0,@a[0] + adc @a[3],@a[4],@tmp[3] +___ +} +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + adc @a[3],@a[4],@tmp[3] + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 +___ +} + +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_384-armv8.pl b/crypto/blst_src/asm/mul_mont_384-armv8.pl new file mode 100755 index 00000000000..44e12a00b03 --- /dev/null +++ b/crypto/blst_src/asm/mul_mont_384-armv8.pl @@ -0,0 +1,2015 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod = map("x$_",(5..10)); +@a = map("x$_",(11..16)); +$bi = "x17"; +@acc = map("x$_",(19..25)); +@tmp = map("x$_",(26..28,0,1,3)); + +$code.=<<___; +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + adcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + adcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + adcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + stp @a[0],@a[1],[$r_ptr,#48] + csel @a[4],@a[4],@acc[4],lo + stp @a[2],@a[3],[$r_ptr,#64] + csel @a[5],@a[5],@acc[5],lo + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + sbcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + sbcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + sbcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[5],@a[5],@acc[5],lo + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret 
+.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov @tmp[0],$r_ptr // save r_ptr + mov @tmp[1],$a_ptr // save b_ptr + mov @tmp[2],$b_ptr // save b_ptr + + sub $r_ptr,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im) + add $b_ptr,$b_ptr,#48 + add $r_ptr,sp,#96 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + sub $b_ptr,$a_ptr,#48 + add $r_ptr,sp,#240 + bl __add_mod_384 + + add $a_ptr,@tmp[2],#0 + add $b_ptr,@tmp[2],#48 + add $r_ptr,sp,#192 // t2 + bl __add_mod_384 + + add $a_ptr,$r_ptr,#0 + add $b_ptr,$r_ptr,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + mov $a_ptr,$r_ptr + add $b_ptr,sp,#0 + bl __sub_mod_384x384 + + add $b_ptr,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add $a_ptr,sp,#0 + add $b_ptr,sp,#96 + add $r_ptr,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add $a_ptr,sp,#0 // ret->re = redc(t0) + add $r_ptr,@tmp[0],#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add $a_ptr,sp,#192 // ret->im = redc(t2) + add $r_ptr,$r_ptr,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! 
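+	// sqr_mont_384x squares an Fp2 element with the familiar identity
+	// (re + im*i)^2 = (re+im)*(re-im) + 2*re*im*i, so only two
+	// Montgomery multiplications are needed; a rough sketch of the
+	// flow below (t0/t1 match the comments further down):
+	//   t0 = re + im,  t1 = re - im      (mod p)
+	//   ret_im = 2 * mont_mul(re, im)
+	//   ret_re = mont_mul(t0, t1)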
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + add $r_ptr,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add $r_ptr,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds @a[0],@a[0],@a[0] // add with itself + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @acc[0],@a[0],@acc[0],lo + csel @acc[1],@a[1],@acc[1],lo + csel @acc[2],@a[2],@acc[2],lo + ldp @a[0],@a[1],[sp] + csel @acc[3],@a[3],@acc[3],lo + ldr $bi, [sp,#48] + csel @acc[4],@a[4],@acc[4],lo + ldp @a[2],@a[3],[sp,#16] + csel @acc[5],@a[5],@acc[5],lo + ldp @a[4],@a[5],[sp,#32] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + add $b_ptr,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
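+	// The shared __mul_mont_384 helper below interleaves schoolbook
+	// multiplication with word-wise Montgomery reduction.  A rough
+	// Python-style model (names illustrative; n0 is precomputed so that
+	// n0 times the low limb of the modulus is -1 mod 2^64):
+	//   acc = 0
+	//   for bi in limbs_of(b):          # one 64-bit limb per iteration
+	//       acc += a * bi
+	//       m = (n0 * acc) % 2**64      # chosen so the low limb cancels
+	//       acc = (acc + m * modulus) >> 64
+	//   return acc if acc < modulus else acc - modulus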
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 + mov $bi,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + adc $n0,$bi,xzr + ldr $bi,[$b_ptr,8*$i] + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],$n0,xzr + ldr $n0,[x29,#96] + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adcs @acc[6],@acc[6],xzr + adc $bi,xzr,xzr + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adcs @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 + adc $bi,$bi,xzr +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + ldp 
$n0,$b_ptr,[x29,#96] // pull r_ptr + adc $bi,$bi,xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adcs @acc[4],@acc[5],@tmp[4] + adcs @acc[5],@acc[6],@tmp[5] + adc @acc[6],$bi,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs @tmp[4],@acc[4],@mod[4] + sbcs @tmp[5],@acc[5],@mod[5] + sbcs xzr, @acc[6],xzr + + csel @a[0],@acc[0],@tmp[0],lo + csel @a[1],@acc[1],@tmp[1],lo + csel @a[2],@acc[2],@tmp[2],lo + csel @a[3],@acc[3],@tmp[3],lo + csel @a[4],@acc[4],@tmp[4],lo + csel @a[5],@acc[5],@tmp[5],lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov $n0,$n_ptr // adjust for missing b_ptr + + mov $n_ptr,$r_ptr // save r_ptr + mov $r_ptr,sp + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + mov $a_ptr,sp + mov $r_ptr,$n_ptr // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + paciasp + stp x29,x30,[sp,#-128]! 
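+	// sqr_n_mul_mont_383 squares its input a given number of times (the
+	// counter travels in the register slot otherwise used for b_ptr,
+	// while the multiplicand pointer arrives in x5) and finishes with
+	// one Montgomery multiplication, so in effect ret is the Montgomery
+	// form of a^(2^count) * b.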
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov $bi,x5 // save b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + mov $r_ptr,sp +.Loop_sqr_383: + bl __sqr_384 + sub $b_ptr,$b_ptr,#1 // counter + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + mov $a_ptr,sp + bl __mul_by_1_mont_384 + + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // just accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + cbnz $b_ptr,.Loop_sqr_383 + + mov $b_ptr,$bi + ldr $bi,[$bi] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ +my @acc=(@acc,@tmp[0..2]); + +$code.=<<___; +.type __sqr_384,%function +.align 5 +__sqr_384: + mul @acc[0],@a[1],@a[0] + mul @acc[1],@a[2],@a[0] + mul @acc[2],@a[3],@a[0] + mul @acc[3],@a[4],@a[0] + mul @acc[4],@a[5],@a[0] + + umulh @mod[1],@a[1],@a[0] + umulh @mod[2],@a[2],@a[0] + umulh @mod[3],@a[3],@a[0] + umulh @mod[4],@a[4],@a[0] + adds @acc[1],@acc[1],@mod[1] + umulh @mod[5],@a[5],@a[0] + adcs @acc[2],@acc[2],@mod[2] + mul @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + mul @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + mul @mod[4],@a[4],@a[1] + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],@a[1] + + adds @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],@a[1] + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],@a[1] + adc @acc[6],xzr,xzr + + mul @mod[0],@a[0],@a[0] + adds @acc[3],@acc[3],@mod[2] + umulh @a[0], @a[0],@a[0] + adcs @acc[4],@acc[4],@mod[3] + mul @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + mul @mod[4],@a[4],@a[2] + adc @acc[6],@acc[6],@mod[5] + mul @mod[5],@a[5],@a[2] + + adds @acc[4],@acc[4],@mod[3] + umulh @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + umulh @mod[4],@a[4],@a[2] + adcs @acc[6],@acc[6],@mod[5] + umulh @mod[5],@a[5],@a[2] + adc @acc[7],xzr,xzr + + mul @mod[1],@a[1],@a[1] + adds @acc[5],@acc[5],@mod[3] + umulh @a[1], @a[1],@a[1] + adcs @acc[6],@acc[6],@mod[4] + mul @mod[4],@a[4],@a[3] + adc @acc[7],@acc[7],@mod[5] + mul @mod[5],@a[5],@a[3] + + adds @acc[6],@acc[6],@mod[4] + umulh @mod[4],@a[4],@a[3] + adcs @acc[7],@acc[7],@mod[5] + umulh @mod[5],@a[5],@a[3] + adc @acc[8],xzr,xzr + mul @mod[2],@a[2],@a[2] + adds @acc[7],@acc[7],@mod[4] + umulh @a[2], @a[2],@a[2] + adc @acc[8],@acc[8],@mod[5] + mul @mod[3],@a[3],@a[3] + + mul @mod[5],@a[5],@a[4] + umulh @a[3], @a[3],@a[3] + adds @acc[8],@acc[8],@mod[5] + umulh @mod[5],@a[5],@a[4] + mul @mod[4],@a[4],@a[4] + adc @acc[9],@mod[5],xzr + + adds @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + adcs @acc[2],@acc[2],@acc[2] + adcs @acc[3],@acc[3],@acc[3] + adcs @acc[4],@acc[4],@acc[4] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adcs @acc[7],@acc[7],@acc[7] + umulh @a[4], 
@a[4],@a[4] + adcs @acc[8],@acc[8],@acc[8] + mul @mod[5],@a[5],@a[5] + adcs @acc[9],@acc[9],@acc[9] + umulh @a[5], @a[5],@a[5] + adc $a_ptr,xzr,xzr + + adds @acc[0],@acc[0],@a[0] + adcs @acc[1],@acc[1],@mod[1] + adcs @acc[2],@acc[2],@a[1] + adcs @acc[3],@acc[3],@mod[2] + adcs @acc[4],@acc[4],@a[2] + adcs @acc[5],@acc[5],@mod[3] + adcs @acc[6],@acc[6],@a[3] + stp @mod[0],@acc[0],[$r_ptr] + adcs @acc[7],@acc[7],@mod[4] + stp @acc[1],@acc[2],[$r_ptr,#16] + adcs @acc[8],@acc[8],@a[4] + stp @acc[3],@acc[4],[$r_ptr,#32] + adcs @acc[9],@acc[9],@mod[5] + stp @acc[5],@acc[6],[$r_ptr,#48] + adc @a[5],@a[5],$a_ptr + stp @acc[7],@acc[8],[$r_ptr,#64] + stp @acc[9],@a[5],[$r_ptr,#80] + + ret +.size __sqr_384,.-__sqr_384 +___ +} +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel @a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mul @tmp[0],$n0,@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + mul @tmp[0],$n0,@a[0] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +} +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel 
@a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + + umulh @mod[0],@a[0],$bi + umulh @mod[1],@a[1],$bi + umulh @mod[2],@a[2],$bi + umulh @mod[3],@a[3],$bi + umulh @mod[4],@a[4],$bi + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,8*1] + + str @acc[0],[$r_ptr] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],$bi +___ +for ($i=1;$i<5;$i++) { +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,#8*($i+1)] + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],@acc[6],@mod[5] + mul @mod[5],@a[5],$bi +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + adcs @acc[1],@acc[2],@mod[1] + adcs @acc[2],@acc[3],@mod[2] + adcs @acc[3],@acc[4],@mod[3] + adcs @acc[4],@acc[5],@mod[4] + adc @acc[5],@acc[6],@mod[5] + + stp @acc[0],@acc[1],[$r_ptr,#48] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp @a[0],@a[1],[$a_ptr] + mov @tmp[0],$r_ptr // save r_ptr + ldp @acc[0],@acc[1],[$a_ptr,#48] + mov @tmp[1],$a_ptr // save a_ptr + ldp @a[2],@a[3],[$a_ptr,#16] + mov @tmp[2],$b_ptr // save b_ptr + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @a[4],@a[5],[$a_ptr,#32] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + ldp @a[0],@a[1],[$b_ptr] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[0],@acc[1],[$b_ptr,#48] + adcs @mod[3],$a[3],@acc[3] + ldp @a[2],@a[3],[$b_ptr,#16] + adcs @mod[4],$a[4],@acc[4] + ldp @acc[2],@acc[3],[$b_ptr,#64] + adc @mod[5],$a[5],@acc[5] + ldp @a[4],@a[5],[$b_ptr,#32] + + stp @mod[0],@mod[1],[sp] + adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + stp @mod[2],@mod[3],[sp,#16] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + stp @mod[4],@mod[5],[sp,#32] + adcs @mod[4],$a[4],@acc[4] + stp @mod[0],@mod[1],[sp,#48] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[sp,#64] + stp @mod[4],@mod[5],[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) + add $b_ptr,sp,#48 + add $r_ptr,@tmp[0],#96 + bl __mul_384 + + add $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) + add $b_ptr,@tmp[2],#48 + add $r_ptr,sp,#0 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + add $a_ptr,@tmp[0],#96 // ret->im -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#96 + bl __sub_mod_384x384 + + add $b_ptr,@tmp[0],#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add $a_ptr,@tmp[0],#0 // ret->re -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @a[2],@a[3],[$a_ptr,#16] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[2],@acc[3],[$a_ptr,#64] + adcs @mod[1],$a[1],@acc[1] + ldp @a[4],@a[5],[$a_ptr,#32] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[3],$a[3],@acc[3] + stp @mod[0],@mod[1],[$r_ptr] + adcs @mod[4],$a[4],@acc[4] + ldp @mod[0],@mod[1],[$b_ptr] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[$r_ptr,#16] + + subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im + ldp @mod[2],@mod[3],[$b_ptr,#16] + sbcs @a[1],$a[1],@acc[1] + stp @mod[4],@mod[5],[$r_ptr,#32] + sbcs @a[2],$a[2],@acc[2] + ldp @mod[4],@mod[5],[$b_ptr,#32] + sbcs @a[3],$a[3],@acc[3] + sbcs @a[4],$a[4],@acc[4] + sbcs @a[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],@acc[6] + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],@acc[6] + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],@acc[6] + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],@acc[6] + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + mov $n0,$a_ptr // save a_ptr + add $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) + add $b_ptr,$r_ptr,#48 + bl __mul_384 + + add $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) + add $b_ptr,$n0,#48 + add $r_ptr,$r_ptr,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + adds @a[0],@a[0],@a[0] // add with itself + ldp @a[4],@a[5],[$r_ptr,#32] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adcs @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + stp @a[0],@a[1],[$r_ptr] + adcs @acc[2],@acc[2],@acc[2] + stp @a[2],@a[3],[$r_ptr,#16] + adcs @acc[3],@acc[3],@acc[3] + stp @a[4],@a[5],[$r_ptr,#32] + adcs @acc[4],@acc[4],@acc[4] + stp @acc[0],@acc[1],[$r_ptr,#48] + adc @acc[5],@acc[5],@acc[5] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp $bi,@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @mod[0],$a[0],$bi // t0 = a->re + a->im + adcs @mod[1],$a[1],@acc[1] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + adcs @mod[4],$a[4],@acc[4] + adc @mod[5],$a[5],@acc[5] + + subs @acc[0],$a[0],$bi // t1 = a->re - a->im + sbcs @acc[1],$a[1],@acc[1] + sbcs @acc[2],$a[2],@acc[2] + sbcs @acc[3],$a[3],@acc[3] + sbcs @acc[4],$a[4],@acc[4] + sbcs @acc[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr // borrow flag as mask + + stp @mod[0],@mod[1],[sp] + stp @mod[2],@mod[3],[sp,#16] + stp @mod[4],@mod[5],[sp,#32] + stp @acc[0],@acc[1],[sp,#48] + stp @acc[2],@acc[3],[sp,#64] + stp @acc[4],@acc[5],[sp,#80] + str @acc[6],[sp,#96] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds @acc[0],@a[0],@a[0] // add with itself + adcs @acc[1],@a[1],@a[1] + adcs @acc[2],@a[2],@a[2] + adcs @acc[3],@a[3],@a[3] + adcs @acc[4],@a[4],@a[4] + adc @acc[5],@a[5],@a[5] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + ldp @a[0],@a[1],[sp] + ldr $bi,[sp,#48] + ldp @a[2],@a[3],[sp,#16] + ldp @a[4],@a[5],[sp,#32] + + add $b_ptr,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr @acc[6],[sp,#96] // account for sign from a->re - a->im + ldp @acc[0],@acc[1],[sp] + ldp @acc[2],@acc[3],[sp,#16] + ldp @acc[4],@acc[5],[sp,#32] + + and @acc[0],@acc[0],@acc[6] + and @acc[1],@acc[1],@acc[6] + and @acc[2],@acc[2],@acc[6] + and @acc[3],@acc[3],@acc[6] + and @acc[4],@acc[4],@acc[6] + and @acc[5],@acc[5],@acc[6] + + subs @a[0],@a[0],@acc[0] + sbcs @a[1],@a[1],@acc[1] + sbcs @a[2],@a[2],@acc[2] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + and @acc[2],@mod[2],@acc[6] + and @acc[3],@mod[3],@acc[6] + and @acc[4],@mod[4],@acc[6] + and @acc[5],@mod[5],@acc[6] + + adds @a[0],@a[0],@acc[0] + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 
+ adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + + ldr $n0,[x29,#96] + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adc @acc[6],@acc[6],xzr + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + ldp $n0,$b_ptr,[x29,#96] // pull r_ptr + + adds @a[0],@acc[1],@tmp[0] + adcs @a[1],@acc[2],@tmp[1] + adcs @a[2],@acc[3],@tmp[2] + adcs @a[3],@acc[4],@tmp[3] + adcs @a[4],@acc[5],@tmp[4] + adcs @a[5],@acc[6],@tmp[5] + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + add $a_ptr,$a_ptr,#48 + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $b_ptr,$b_ptr,$bi + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ + +if (0) { +my @b = ($bi, @mod[0..4]); +my @comba = @acc[4..6]; + +$code.=<<___; +.type __mul_384_comba,%function +.align 5 +__mul_384_comba: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @b[4],@b[5],[$b_ptr,#32] + + mul @comba[0],@a[0],@b[0] + umulh @comba[1],@a[0],@b[0] + mul @acc[0],@a[1],@b[0] + umulh @acc[1],@a[1],@b[0] + str @comba[0],[$r_ptr] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[0],@b[1] + umulh @acc[3],@a[0],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],xzr, @acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[2],@b[0] + umulh @acc[1],@a[2],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#8] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[1],@b[1] + umulh @acc[3],@a[1],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[0],@b[2] + umulh @acc[1],@a[0],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[3],@b[0] + umulh @acc[3],@a[3],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#16] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[2],@b[1] + umulh @acc[1],@a[2],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[1],@b[2] + umulh @acc[3],@a[1],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[0],@b[3] + umulh @acc[1],@a[0],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[4],@b[0] + umulh @acc[3],@a[4],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#24] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[3],@b[1] + umulh @acc[1],@a[3],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[2],@b[2] + umulh @acc[3],@a[2],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[3] + umulh @acc[1],@a[1],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[4] + umulh @acc[3],@a[0],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[0] + umulh @acc[1],@a[5],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#32] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[1] + umulh @acc[3],@a[4],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[2] + umulh @acc[1],@a[3],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs 
@comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[3] + umulh @acc[3],@a[2],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[4] + umulh @acc[1],@a[1],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[5] + umulh @acc[3],@a[0],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[1] + umulh @acc[1],@a[5],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#40] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[2] + umulh @acc[3],@a[4],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[3] + umulh @acc[1],@a[3],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[4] + umulh @acc[3],@a[2],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[5] + umulh @acc[1],@a[1],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[2] + umulh @acc[3],@a[5],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#48] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[3] + umulh @acc[1],@a[4],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[4] + umulh @acc[3],@a[3],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[2],@b[5] + umulh @acc[1],@a[2],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[3] + umulh @acc[3],@a[5],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#56] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[4] + umulh @acc[1],@a[4],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[5] + umulh @acc[3],@a[3],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[4] + umulh @acc[1],@a[5],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#64] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[5] + umulh @acc[3],@a[4],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[5],@b[5] + umulh @acc[1],@a[5],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#72] +___ + push(@comba,shift(@comba)); +$code.=<<___; + adds @comba[0],@comba[0],@acc[0] + adc @comba[1],@comba[1],@acc[1] + stp @comba[0],@comba[1],[$r_ptr,#80] + + ret +.size __mul_384_comba,.-__mul_384_comba +___ +} +print $code; + +close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_256-x86_64.pl b/crypto/blst_src/asm/mulq_mont_256-x86_64.pl new file mode 100755 index 00000000000..12e58bb001e 
--- /dev/null +++ b/crypto/blst_src/asm/mulq_mont_256-x86_64.pl @@ -0,0 +1,513 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits +my @acc=map("%r$_",(9..15)); + +{ ############################################################## mulq +my ($hi, $a0) = ("%rbp", $r_ptr); + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,\@function,5,"unwind" +.align 32 +mul_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[4] + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), @acc[3] + mov 8*3($a_ptr), $hi + mov $b_org, $b_ptr # evacuate from %rdx + + mov %rax, @acc[6] + mulq @acc[4] # a[0]*b[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,\@function,4,"unwind" +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), %rax + mov $n_ptr, $n0 + mov 8*1($a_ptr), @acc[5] + mov $b_org, $n_ptr + mov 8*2($a_ptr), @acc[3] + lea ($a_ptr), $b_ptr + mov 8*3($a_ptr), $hi + + mov %rax, @acc[6] + mulq %rax # a[0]*a[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +{ +my @acc=@acc; +$code.=<<___; +.type __mulq_mont_sparse_256,\@abi-omnipotent +.align 32 +__mulq_mont_sparse_256: + mulq @acc[5] # a[1]*b[0] + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[3] # 
a[2]*b[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq $hi # a[3]*b[0] + add %rax, @acc[3] + mov 8($b_ptr), %rax + adc \$0, %rdx + xor @acc[5], @acc[5] + mov %rdx, @acc[4] + +___ +for (my $i=1; $i<4; $i++) { +my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], $a0 + imulq $n0, @acc[0] + + ################################# Multiply by b[$i] + mov %rax, @acc[6] + mulq 8*0($a_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*1($a_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($a_ptr) + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($a_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc %rdx, @acc[5] # can't overflow + xor @acc[6], @acc[6] + + ################################# reduction + mulq 8*0($n_ptr) + add %rax, $a0 # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $a0 + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $a0, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq $n0, %rax + mov 8(%rsp), $a_ptr # restore $r_ptr + + ################################# last reduction + mov %rax, @acc[6] + mulq 8*0($n_ptr) + add %rax, @acc[0] # guaranteed to be zero + mov @acc[6], %rax + adc %rdx, @acc[0] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + add @acc[0], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + mov @acc[2], $b_ptr + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + + ################################# + # Branch-less conditional subtraction of modulus + + mov @acc[3], @acc[0] + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + sbb 8*2($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*3($n_ptr), @acc[4] + sbb \$0, @acc[5] + + cmovc %rax, @acc[1] + cmovc $b_ptr, @acc[2] + cmovc @acc[0], @acc[3] + mov @acc[1], 8*0($a_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*1($a_ptr) + mov @acc[3], 8*2($a_ptr) + mov @acc[4], 8*3($a_ptr) + + ret +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +___ +} } +{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,\@function,4,"unwind" +.align 32 +from_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + ################################# + # Branch-less conditional acc[0:3] - modulus + + #mov @acc[4], %rax # __mulq_by_1_mont_256 does it + mov @acc[5], @acc[1] + mov @acc[6], @acc[2] + mov @acc[0], @acc[3] + + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + 
sbb 8*3($n_ptr), @acc[0] + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,\@function,4,"unwind" +.align 32 +redc_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + add 8*4($a_ptr), @acc[4] # accumulate upper half + adc 8*5($a_ptr), @acc[5] + mov @acc[4], %rax + adc 8*6($a_ptr), @acc[6] + mov @acc[5], @acc[1] + adc 8*7($a_ptr), @acc[0] + sbb $a_ptr, $a_ptr + + ################################# + # Branch-less conditional acc[0:4] - modulus + + mov @acc[6], @acc[2] + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + mov @acc[0], @acc[3] + sbb 8*3($n_ptr), @acc[0] + sbb \$0, $a_ptr + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +___ +{ +my @acc=@acc; + +$code.=<<___; +.type __mulq_by_1_mont_256,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_256: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + mov %rax, @acc[4] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<4; $i++) { +my $hi = @acc[4]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[4] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[4] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[4], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) +___ +$code.=<<___ if ($i<3); + mov @acc[1], @acc[5] + imulq $n0, @acc[1] +___ +$code.=<<___; + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 +___ +} } } + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_384-x86_64.pl b/crypto/blst_src/asm/mulq_mont_384-x86_64.pl new file mode 100755 index 00000000000..3812319e8ba --- /dev/null +++ b/crypto/blst_src/asm/mulq_mont_384-x86_64.pl @@ -0,0 +1,2675 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +######################################################################## +{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +######################################################################## +# Double-width subtraction modulo n<<384, as opposite to naively +# expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, 
$b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__sub_mod_384_a_is_loaded: + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 +___ +} + +######################################################################## +# "Complex" multiplication and squaring. Use vanilla multiplication when +# possible to fold reductions. I.e. instead of mul_mont, mul_mont +# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod +# followed by *common* reduction... 
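+#
+# A sketch of that folding idea, in the notation of mul_mont_384x below
+# (assuming the usual Fp2 representation where the imaginary unit squares
+# to -1): with the three plain, double-width (unreduced) products
+#     t0 = a->re * b->re
+#     t1 = a->im * b->im
+#     t2 = (a->re + a->im) * (b->re + b->im)
+# the complex result is assembled as
+#     ret->re = redc(t0 - t1)
+#     ret->im = redc(t2 - t0 - t1)
+# where the subtractions are the double-width __sub_mod_384x384 above and
+# redc denotes __mulq_by_1_mont_384 followed by __redc_tail_mont_384, i.e.
+# one Montgomery reduction per output half instead of one per multiplication.
+#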
+{ my $frame = 5*8 + # place for argument off-load + + 3*768/8; # place for 3 768-bit temporary vectors +$code.=<<___; +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,\@function,5,"unwind" +.align 32 +mul_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $b_org, $b_ptr + mov $r_ptr, 8*4(%rsp) # offload arguments + mov $a_ptr, 8*3(%rsp) + mov $b_org, 8*2(%rsp) + mov $n_ptr, 8*1(%rsp) + mov $n0, 8*0(%rsp) + + ################################# mul_384(t0, a->re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulq_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 48($a_ptr), $a_ptr # a->im + lea 40+96(%rsp), $r_ptr # t1 + call __mulq_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea -48($a_ptr), $b_org + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*2(%rsp), $a_ptr + lea 48($a_ptr), $b_org + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulq_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2=t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2=t2-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +$code.=<<___; +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,\@function,4,"unwind" +.align 32 +sqr_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $a_ptr, 8*2(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, 
a->im); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + call __mulq_mont_384 +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($r_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,\@function,4,"unwind" +.align 32 +mul_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), 
@acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulq_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulq_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulq_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_382x,.-mul_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,\@function,3,"unwind" +.align 32 +sqr_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulq_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulq_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 
8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_382x,.-sqr_382x +___ +} +{ ########################################################## 384-bit mul +my @acc=map("%r$_",("cx",8..12)); +my $bi = "%rbp"; + +$code.=<<___; +.globl mul_384 +.hidden mul_384 +.type mul_384,\@function,3,"unwind" +.align 32 +mul_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org, $b_ptr + call __mulq_384 + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,\@abi-omnipotent +.align 32 +__mulq_384: + mov 8*0($b_ptr), %rax + + mov %rax, $bi + mulq 8*0($a_ptr) + mov %rax, 8*0($r_ptr) + mov $bi, %rax + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[4] + mov 8*1($b_ptr), %rax + adc \$0, %rdx + mov %rdx, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov @acc[0], 8*$i($r_ptr) + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[1], @acc[0] + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[2], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[3], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[4], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[5], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulq_384,.-__mulq_384 +___ +} +if (0) { ############################################################## +my @b=map("%r$_",(10..15)); +my @a=reverse(@b); + @b[5]=$b_ptr; +my $bi = "%rbp"; +my @comba=map("%r$_",("cx",8,9)); +# a[0]*b[0] +# a[1]*b[0] +# a[0]*b[1] +# a[2]*b[0] +# a[1]*b[1] +# a[0]*b[2] +# a[3]*b[0] +# a[2]*b[1] +# a[1]*b[2] +# a[0]*b[3] +# a[4]*b[0] +# a[3]*b[1] +# a[2]*b[2] +# a[1]*b[3] +# a[0]*b[4] +# a[5]*b[0] +# a[4]*b[1] +# a[3]*b[2] +# a[2]*b[3] +# a[1]*b[4] +# a[0]*b[5] +# a[5]*b[1] +# a[4]*b[2] +# a[3]*b[3] +# a[2]*b[4] +# a[1]*b[5] +# a[5]*b[2] +# a[4]*b[3] +# a[3]*b[4] +# a[2]*b[5] +# a[5]*b[3] +# a[4]*b[4] +# a[3]*b[5] +# a[5]*b[4] +# a[4]*b[5] +# a[5]*b[5] +# +# 13% less instructions give +15% on Core2, +10% on Goldmont, +# -0% on Sandy Bridge, but -16% on Haswell:-( +# [for reference +5% on Skylake, +11% on Ryzen] + +$code.=<<___; +.type __mulq_comba_384,\@abi-omnipotent +.align 32 +__mulq_comba_384: + mov 8*0($b_ptr), %rax + mov 8*0($a_ptr), @a[0] + mov 8*1($a_ptr), @a[1] + mov 8*1($b_ptr), @b[1] + + mov %rax, @b[0] + mulq @a[0] # a[0]*b[0] + mov %rax, 8*0($r_ptr) + mov @b[0], %rax + mov %rdx, @comba[0] + + ################################# + mov 8*2($a_ptr), @a[2] + xor @comba[2], @comba[2] + mulq @a[1] # a[1]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc \$0, %rdx + mov 8*2($b_ptr), @b[2] + mov %rdx, @comba[1] + + mulq @a[0] # a[0]*b[1] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*1($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[2] # a[2]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[0] # a[0]*b[2] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*2($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*3($a_ptr) # a[3]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[2] # a[2]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[2] + add %rax, @comba[0] + mov 8*3($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[3] + mulq @a[0] # a[0]*b[3] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 
8*3($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*4($a_ptr) # a[4]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[3] + add %rax, @comba[0] + mov 8*4($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[4] + mulq @a[0] # a[0]*b[4] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + mov 8*5($a_ptr), @a[5] + adc \$0, @comba[2] + mov @comba[0], 8*4($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*4($a_ptr) # a[4]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[4] + add %rax, @comba[0] + mov 8*5($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[5] + mulq @a[0] # a[0]*b[5] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + mov 8*4($a_ptr), @a[4] + adc \$0, @comba[2] + mov @comba[0], 8*5($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[5] + add %rax, @comba[0] + mov $b[2], %rax + adc %rdx, @comba[1] + mov 8*3($a_ptr), @a[3] + adc \$0, @comba[2] + mov @comba[0], 8*6($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[5] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*7($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[5] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*8($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[5] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov 
@comba[0], 8*9($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + adc %rdx, @comba[1] + + mov @comba[0], 8*10($r_ptr) + mov @comba[1], 8*11($r_ptr) + + ret +.size __mulq_comba_384,.-__mulq_comba_384 +___ +} +{ ########################################################## 384-bit sqr +my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); +my $hi; + +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,\@function,2,"unwind" +.align 32 +sqr_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrq_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,\@abi-omnipotent +.align 32 +__sqrq_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + + ######################################### + mov %rax, @acc[6] + mulq @acc[7] # a[1]*a[0] + mov %rax, @acc[1] + mov @acc[6], %rax + mov 8*4($a_ptr), @acc[10] + mov %rdx, @acc[2] + + mulq @acc[8] # a[2]*a[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov 8*5($a_ptr), @acc[11] + mov %rdx, @acc[3] + + mulq @acc[9] # a[3]*a[0] + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq @acc[10] # a[4]*a[0] + add %rax, @acc[4] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq @acc[11] # a[5]*a[0] + add %rax, @acc[5] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq %rax # a[0]*a[0] + xor @acc[0], @acc[0] + mov %rax, 8*0($r_ptr) + mov @acc[7], %rax + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[0] + add %rdx, @acc[1] # accumulate a[0]*a[0] + adc \$0, @acc[0] # carries to a[1]*a[1] + mov @acc[1], 8*1($r_ptr) +___ +$hi=@acc[1]; +$code.=<<___; + ######################################### + mulq @acc[8] # a[2]*a[1] + add %rax, @acc[3] + mov @acc[7], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[9] # a[3]*a[1] + add %rax, @acc[4] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[1] + add %rax, @acc[5] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[1] + add %rax, @acc[6] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq %rax # a[1]*a[1] + xor @acc[1], @acc[1] + add %rax, @acc[0] # can't carry + mov @acc[8], %rax + add @acc[2], @acc[2] # double acc[2:3] + adc @acc[3], @acc[3] + adc \$0, @acc[1] + add @acc[0], @acc[2] # accumulate a[1]*a[1] + adc %rdx, @acc[3] + adc \$0, @acc[1] # carries to a[2]*a[2] + mov @acc[2], 8*2($r_ptr) +___ +$hi=@acc[0]; +$code.=<<___; + ######################################### + mulq @acc[9] # a[3]*a[2] + add %rax, @acc[5] + mov @acc[8], %rax + adc \$0, %rdx + mov @acc[3], 8*3($r_ptr) + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[2] + add %rax, @acc[6] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[2] + add %rax, @acc[7] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[7] + adc \$0, %rdx + mov %rdx, 
@acc[8] + + mulq %rax # a[2]*a[2] + xor @acc[3], @acc[3] + add %rax, @acc[1] # can't carry + mov @acc[9], %rax + add @acc[4], @acc[4] # double acc[4:5] + adc @acc[5], @acc[5] + adc \$0, @acc[3] + add @acc[1], @acc[4] # accumulate a[2]*a[2] + adc %rdx, @acc[5] + adc \$0, @acc[3] # carries to a[3]*a[3] + mov @acc[4], 8*4($r_ptr) + + ######################################### + mulq @acc[10] # a[4]*a[3] + add %rax, @acc[7] + mov @acc[9], %rax + adc \$0, %rdx + mov @acc[5], 8*5($r_ptr) + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[3] + add %rax, @acc[8] + mov @acc[9], %rax + adc \$0, %rdx + add $hi, @acc[8] + adc \$0, %rdx + mov %rdx, @acc[9] + + mulq %rax # a[3]*a[3] + xor @acc[4], @acc[4] + add %rax, @acc[3] # can't carry + mov @acc[10], %rax + add @acc[6], @acc[6] # double acc[6:7] + adc @acc[7], @acc[7] + adc \$0, @acc[4] + add @acc[3], @acc[6] # accumulate a[3]*a[3] + adc %rdx, @acc[7] + mov @acc[6], 8*6($r_ptr) + adc \$0, @acc[4] # carries to a[4]*a[4] + mov @acc[7], 8*7($r_ptr) + + ######################################### + mulq @acc[11] # a[5]*a[4] + add %rax, @acc[9] + mov @acc[10], %rax + adc \$0, %rdx + mov %rdx, @acc[10] + + mulq %rax # a[4]*a[4] + xor @acc[5], @acc[5] + add %rax, @acc[4] # can't carry + mov @acc[11], %rax + add @acc[8], @acc[8] # double acc[8:9] + adc @acc[9], @acc[9] + adc \$0, @acc[5] + add @acc[4], @acc[8] # accumulate a[4]*a[4] + adc %rdx, @acc[9] + mov @acc[8], 8*8($r_ptr) + adc \$0, @acc[5] # carries to a[5]*a[5] + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulq %rax # a[5]*a[5] + add @acc[5], %rax # can't carry + add @acc[10], @acc[10] # double acc[10] + adc \$0, %rdx + add @acc[10], %rax # accumulate a[5]*a[5] + adc \$0, %rdx + mov %rax, 8*10($r_ptr) + mov %rdx, 8*11($r_ptr) + + ret +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,\@function,4,"unwind" +.align 32 +sqr_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*15, %rsp +.cfi_adjust_cfa_offset 8*15 +.cfi_end_prologue + + mov $n_ptr, 8*12(%rsp) # n0 + mov $b_org, 8*13(%rsp) # n_ptr + mov $r_ptr, 8*14(%rsp) + + mov %rsp, $r_ptr + call __sqrq_384 + + lea 0(%rsp), $a_ptr + mov 8*12(%rsp), %rcx # n0 for mul_by_1 + mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 + mov 8*14(%rsp), $r_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea 8*15(%rsp), %r8 # size optimization + mov 8*15(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*21 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 +___ +} +{ ########################################################## 384-bit redc_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +######################################################################## +# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,\@function,4,"unwind" +.align 32 +redc_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp 
+.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + +######################################################################## +# void from_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,\@function,4,"unwind" +.align 32 +from_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[6], %rax # __mulq_by_1_mont_384 does it + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_384,.-from_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov %rax, @acc[6] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<6; $i++) { +my $hi = @acc[6]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[6] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[6] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx +___ +$code.=<<___ if ($i<5); + mov @acc[1], @acc[7] + imulq $n0, @acc[1] +___ +$code.=<<___; + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + 
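+# __redc_tail_mont_384 completes the reduction started by __mulq_by_1_mont_384:
+# it accumulates the upper six limbs of the 768-bit intermediate into acc[0:5],
+# then performs a branch-less conditional subtraction of the modulus (sbb/cmovc)
+# before storing the 384-bit result to $r_ptr.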
+.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, 
$r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ +} } + +{ ########################################################## mulq_mont +my ($bi, $hi) = ("%rdi", "%rbp"); + +$code.=<<___; +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,\@function,5,"unwind" +.align 32 +mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*3, %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + mov $b_org, $b_ptr # evacuate from %rdx + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + + call __mulq_mont_384 + + mov 24(%rsp),%r15 +.cfi_restore %r15 + mov 32(%rsp),%r14 +.cfi_restore %r14 + mov 40(%rsp),%r13 +.cfi_restore %r13 + mov 48(%rsp),%r12 +.cfi_restore %r12 + mov 56(%rsp),%rbx +.cfi_restore %rbx + mov 64(%rsp),%rbp +.cfi_restore %rbp + lea 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_mont_384,\@abi-omnipotent +.align 32 +__mulq_mont_384: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + xor @acc[7], @acc[7] + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, $hi # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $hi + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[5] + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($a_ptr) + add @acc[7], @acc[5] + adc \$0, %rdx + xor @acc[7], @acc[7] + add %rax, @acc[5] + mov @acc[0], %rax + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ +} +$code.=<<___; + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[0], %rax + mov 8*2(%rsp), $r_ptr # restore $r_ptr + sub 8*0($n_ptr), @acc[0] + mov @acc[1], %rdx + sbb 8*1($n_ptr), @acc[1] + mov @acc[2], $b_ptr + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*3($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[7] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rdx, @acc[1] + cmovc $b_ptr, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[7], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __mulq_mont_384,.-__mulq_mont_384 +___ +} } +$code.=<<___; +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_384: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1, %edx + lea 0($r_ptr), $a_ptr + dec %edx + jnz .Loop_sqr_384 + + movq %xmm2, %rax # 
b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #mov 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 + + lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_383: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + + movd %xmm1, %edx # loop counter + add 8*6($a_ptr), @acc[6] # just accumulate upper half + adc 8*7($a_ptr), @acc[7] + adc 8*8($a_ptr), @acc[0] + adc 8*9($a_ptr), @acc[1] + adc 8*10($a_ptr), @acc[2] + adc 8*11($a_ptr), @acc[3] + lea 0($r_ptr), $a_ptr + + mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% + mov @acc[7], 8*1($r_ptr) # in addition-chains + mov @acc[0], 8*2($r_ptr) + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + dec %edx + jnz .Loop_sqr_383 + + movq %xmm2, %rax # b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #movq 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 # formally one can omit full reduction + # even after multiplication... 
+ lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + my $bi = "%rbp"; + +$code.=<<___; +.type __mulq_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulq_mont_383_nonred: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[7] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[7] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*3($n_ptr) + add @acc[7], @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[7], @acc[5] + adc %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[6] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*5($a_ptr) + add @acc[6], @acc[5] + adc \$0, %rdx + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +} +$code.=<<___; + ret +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives 8-11% better performance in add-chains +$code.=<<___; +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,\@function,4,"unwind" +.align 32 +sqr_mont_382x: +.cfi_startproc + push 
%rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + mov $r_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + mov 8*3(%rsp), $r_ptr + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($r_ptr) # ret->im + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + mov 32+8*0(%rsp), @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[6] + mov 32+8*2(%rsp), @acc[8] + and @acc[11], @acc[7] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[8] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), 
@acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/mulx_mont_256-x86_64.pl b/crypto/blst_src/asm/mulx_mont_256-x86_64.pl new file mode 100755 index 00000000000..0d6bf2e465c --- /dev/null +++ b/crypto/blst_src/asm/mulx_mont_256-x86_64.pl @@ -0,0 +1,486 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# "Sparse" in subroutine names refers to most significant limb of the +# modulus. Though "sparse" is a bit of misnomer, because limitation is +# just not-all-ones. Or in other words not larger than 2^256-2^192-1. +# In general Montgomery multiplication algorithm can handle one of the +# inputs being non-reduced and capped by 1<re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulx_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 128+48($a_ptr), $a_ptr # a->im + lea 96($r_ptr), $r_ptr # t1 + call __mulx_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea ($b_ptr), $a_ptr # b->re + lea -48($b_ptr), $b_org # b->im + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulx_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2-t0-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc 
+.size mulx_mont_384x,.-mulx_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # alignment +$code.=<<___; +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,\@function,4,"unwind" +.align 32 +sqrx_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + # gap for __mulx_mont_384 + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, a->im); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $a_ptr, $a_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $a_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($b_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($b_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($b_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,\@function,4,"unwind" +.align 32 +mulx_382x: 
+.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulx_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulx_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48+128($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulx_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_382x,.-mulx_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,\@function,3,"unwind" +.align 32 +sqrx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), 
@acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulx_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulx_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +___ +} +{ ########################################################## 384-bit mulx +my ($a0, $a1) = @acc[6..7]; +my @acc = @acc[0..5]; +my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); + +$code.=<<___; +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,\@function,3,"unwind" +.align 32 +mulx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + call __mulx_384 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,\@abi-omnipotent +.align 32 +__mulx_384: + mov 8*0($b_ptr), %rdx + mov 8*0($a_ptr), $a0 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea -128($a_ptr), $a_ptr + + mulx $a0, @acc[1], $hi + xor $zr, $zr + + mulx $a1, @acc[0], $lo + adcx $hi, @acc[0] + mov @acc[1], 
8*0($r_ptr) + + mulx @acc[2], @acc[1], $hi + adcx $lo, @acc[1] + + mulx @acc[3], @acc[2], $lo + adcx $hi, @acc[2] + + mulx @acc[4], @acc[3], $hi + adcx $lo, @acc[3] + + mulx @acc[5], @acc[4], @acc[5] + mov 8*1($b_ptr), %rdx + adcx $hi, @acc[4] + adcx $zr, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mulx $a0, $lo, $hi + adcx @acc[0], $lo + adox $hi, @acc[1] + mov $lo, 8*$i($r_ptr) + + mulx $a1, @acc[0], $hi + adcx @acc[1], $acc[0] + adox $hi, @acc[2] + + mulx 128+8*2($a_ptr), @acc[1], $lo + adcx @acc[2], @acc[1] + adox $lo, @acc[3] + + mulx 128+8*3($a_ptr), @acc[2], $hi + adcx @acc[3], @acc[2] + adox $hi, @acc[4] + + mulx 128+8*4($a_ptr), @acc[3], $lo + adcx @acc[4], @acc[3] + adox @acc[5], $lo + + mulx 128+8*5($a_ptr), @acc[4], @acc[5] + mov $b_next, %rdx + adcx $lo, @acc[4] + adox $zr, @acc[5] + adcx $zr, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulx_384,.-__mulx_384 +___ +} +{ ########################################################## 384-bit sqrx +$code.=<<___; +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,\@function,2,"unwind" +.align 32 +sqrx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrx_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_384,.-sqrx_384 +___ +if (0) { +# up to 5% slower than below variant +my @acc=map("%r$_",("no",8..15,"cx","bx")); + push(@acc, $a_ptr); +my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + adc $hi, @acc[5] + adc \$0, @acc[6] + + mulx %rdx, $lo, $hi # a[0]*a[0] + mov @acc[7], %rdx + xor @acc[7], @acc[7] + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[7] + add $hi, @acc[1] + adc \$0, @acc[7] + mov $lo, 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) +___ +($carry, @acc[7]) = (@acc[7], @acc[1]); +$code.=<<___; + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + mulx %rdx, $lo, $hi # a[1]*a[1] + mov @acc[8], %rdx + xor @acc[8], @acc[8] + adox @acc[2], @acc[2] # double acc[2:3] + adcx $carry, $lo # can't carry + adox @acc[3], @acc[3] + adcx $lo, 
@acc[2] + adox @acc[8], @acc[8] + adcx $hi, @acc[3] + adc \$0, @acc[8] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) +___ +($carry,@acc[8])=(@acc[8],$carry); +$code.=<<___; + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + mulx %rdx, $lo, $hi # a[2]*a[2] + mov @acc[9], %rdx + xor @acc[9], @acc[9] + adox @acc[4], @acc[4] # double acc[4:5] + adcx $carry, $lo # can't carry + adox @acc[5], @acc[5] + adcx $lo, @acc[4] + adox @acc[9], @acc[9] + adcx $hi, @acc[5] + adc \$0, $acc[9] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +($carry,@acc[9])=(@acc[9],$carry); +$code.=<<___; + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + mulx %rdx, $lo, $hi + mov @acc[10], %rdx + xor @acc[10], @acc[10] + adox @acc[6], @acc[6] # double acc[6:7] + adcx $carry, $lo # can't carry + adox @acc[7], @acc[7] + adcx $lo, @acc[6] + adox @acc[10], @acc[10] + adcx $hi, @acc[7] + adc \$0, $acc[10] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) +___ +($carry,@acc[10])=(@acc[10],$carry); +$code.=<<___; + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + add $lo, @acc[9] + adc \$0, @acc[10] + + mulx %rdx, $lo, $hi # a[4]*a[4] + mov @acc[11], %rdx + xor @acc[11], @acc[11] + adox @acc[8], @acc[8] # double acc[8:10] + adcx $carry, $lo # can't carry + adox @acc[9], @acc[9] + adcx $lo, @acc[8] + adox @acc[10], @acc[10] + adcx $hi, @acc[9] + adox @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulx %rdx, $lo, $hi # a[5]*a[5] + adcx $lo, @acc[10] + adcx $hi, @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} else { +my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); +my ($lo, $hi)=($r_ptr, "%rax"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + mov @acc[7], %rdx + adc $hi, @acc[5] + adc \$0, @acc[6] + + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + mov @acc[8], %rdx + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # 
a[5]*a[2] + mov @acc[9], %rdx + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + mov @acc[10], %rdx + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + mov 8*0($a_ptr), %rdx + add $lo, @acc[9] + mov 8(%rsp), $r_ptr # restore $r_ptr + adc \$0, @acc[10] + + ######################################### double acc[1:10] + xor @acc[11], @acc[11] + adcx @acc[1], @acc[1] + adcx @acc[2], @acc[2] + adcx @acc[3], @acc[3] + adcx @acc[4], @acc[4] + adcx @acc[5], @acc[5] + + ######################################### accumulate a[i]*a[i] + mulx %rdx, %rdx, $hi # a[0]*a[0] + mov %rdx, 8*0($r_ptr) + mov 8*1($a_ptr), %rdx + adox $hi, @acc[1] + mov @acc[1], 8*1($r_ptr) + + mulx %rdx, @acc[1], $hi # a[1]*a[1] + mov 8*2($a_ptr), %rdx + adox @acc[1], @acc[2] + adox $hi, @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] + mov 8*3($a_ptr), %rdx + adox @acc[1], @acc[4] + adox @acc[2], @acc[5] + adcx @acc[6], @acc[6] + adcx @acc[7], @acc[7] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] + mov 8*4($a_ptr), %rdx + adox @acc[1], @acc[6] + adox @acc[2], @acc[7] + adcx @acc[8], @acc[8] + adcx @acc[9], @acc[9] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] + mov 8*5($a_ptr), %rdx + adox @acc[1], @acc[8] + adox @acc[2], @acc[9] + adcx @acc[10], @acc[10] + adcx @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] + adox @acc[1], @acc[10] + adox @acc[2], @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} + +{ ########################################################## 384-bit redcx_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" +my ($lo, $hi) = ("%rax", "%rbp"); + +$code.=<<___; +######################################################################## +# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,\@function,4,"unwind" +.align 32 +redcx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + +######################################################################## +# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,\@function,4,"unwind" +.align 32 +fromx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push 
%r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[6], %rax + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_384: + mov 8*0($a_ptr), @acc[0] + mov $n0, %rdx + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] +___ +for (my $i=0; $i<6; $i++) { +$code.=<<___; + imulq @acc[0], %rdx + + ################################# reduction $i + xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 + mulx 8*0($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5($n_ptr), $lo, $hi + mov $n0, %rdx + adcx $lo, @acc[5] + adox @acc[6], $hi + adcx $hi, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push 
%r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +___ +} } + +{ ########################################################## mulx/sqrx_mont +my @acc = (@acc, "%rax"); +my ($lo,$hi)=("%rdi","%rbp"); + +$code.=<<___; +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,\@function,5,"unwind" +.align 32 +mulx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + mov $n0, (%rsp) + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_384,\@abi-omnipotent +.align 32 +__mulx_mont_384: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] + xor @acc[7], @acc[7] + +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], 16(%rsp) + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx $hi, @acc[7] # cf=0 + adox @acc[8], @acc[7] + adox @acc[8], @acc[8] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx 16(%rsp), $lo # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[0], @acc[6] + adox @acc[0], @acc[7] + adcx @acc[0], @acc[7] + adox @acc[0], @acc[8] + adcx @acc[0], @acc[8] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], @acc[0] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + mov @acc[3], $a_ptr + + mulx 8*5+128($n_ptr), $lo, $hi + adcx $lo, @acc[5] + adox $hi, @acc[6] + mov @acc[1], %rdx + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + lea 128($n_ptr), $n_ptr + mov @acc[4], @acc[8] + adc \$0, @acc[7] + + ################################# + # Branch-less conditional acc[1:7] - modulus + + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + mov @acc[5], $lo + sbb 8*2($n_ptr), @acc[3] + sbb 8*3($n_ptr), @acc[4] + sbb 8*4($n_ptr), @acc[5] + mov @acc[6], $hi + sbb 8*5($n_ptr), @acc[6] + sbb \$0, @acc[7] + + cmovnc @acc[1], %rdx + cmovc @acc[0], @acc[2] + cmovc $a_ptr, @acc[3] + cmovnc @acc[4], @acc[8] + mov %rdx, 8*0($b_ptr) + cmovnc @acc[5], $lo + mov @acc[2], 8*1($b_ptr) + cmovnc @acc[6], $hi + mov @acc[3], 8*2($b_ptr) + mov @acc[8], 8*3($b_ptr) + mov $lo, 8*4($b_ptr) + mov $hi, 8*5($b_ptr) + + ret +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +___ +} +$code.=<<___; +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,\@function,4,"unwind" +.align 32 +sqrx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $n_ptr, $n0 # n0 + lea -128($b_org), $n_ptr # control u-op density + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + 
mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + lea ($a_ptr), $b_ptr + mov $n0, (%rsp) # n0 + lea -128($a_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 # as fast as dedicated squaring + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + +.Loop_sqrx_384: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 + + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_384 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + lea -128($n_ptr), $n_ptr # control u-op density + +.Loop_sqrx_383: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_383_nonred # omitting full reduction gives ~15% + # in addition-chains + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_383 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + mulx @acc[6], 
@acc[0], @acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], @acc[8] + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx @acc[7], $hi + adox $hi, @acc[7] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[8] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + adcx @acc[8], @acc[7] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adc \$0, @acc[6] + mov @acc[4], @acc[8] + + mov @acc[1], 8*0($b_ptr) + mov @acc[2], 8*1($b_ptr) + mov @acc[3], 8*2($b_ptr) + mov @acc[5], $lo + mov @acc[4], 8*3($b_ptr) + mov @acc[5], 8*4($b_ptr) + mov @acc[6], 8*5($b_ptr) + mov @acc[6], $hi + + ret +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +___ +} } } +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions 
gives ~10% better performance in add-chains +$code.=<<___; +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,\@function,4,"unwind" +.align 32 +sqrx_mont_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($b_ptr) # ret->im + mov @acc[1], 8*7($b_ptr) + mov @acc[2], 8*8($b_ptr) + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32-128(%rsp), $a_ptr # t0 [+u-op density] + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + #lea -128($a_ptr), $a_ptr # control u-op density + #lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + lea 128($n_ptr), $n_ptr + mov 32+8*0(%rsp), @acc[6] + and @acc[11], @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[7] + mov 
32+8*2(%rsp), @acc[8] + and @acc[11], @acc[8] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[9] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($b_ptr) # ret->re + mov @acc[1], 8*1($b_ptr) + mov @acc[2], 8*2($b_ptr) + mov @acc[3], 8*3($b_ptr) + mov @acc[4], 8*4($b_ptr) + mov @acc[5], 8*5($b_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/crypto/blst_src/asm/sha256-armv8.pl b/crypto/blst_src/asm/sha256-armv8.pl new file mode 100755 index 00000000000..1de27c70667 --- /dev/null +++ b/crypto/blst_src/asm/sha256-armv8.pl @@ -0,0 +1,541 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for ARMv8. +# +# This module is stripped of scalar code paths, with raionale that all +# known processors are NEON-capable. +# +# See original module at CRYPTOGAMS for further details. 
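For orientation, the block routine generated by this module compresses whole 64-byte blocks and leaves message padding to the caller. Below is a minimal C sketch of how such a primitive is typically driven; the prototypes are assumptions inferred from the argument registers used in the assembly (state pointer, input pointer, block count) rather than quotations from blst's headers, and final-block padding is deliberately omitted.

#include <stddef.h>
#include <stdint.h>

/* Assumed prototypes, inferred from the assembly's argument usage. */
void blst_sha256_block_data_order(uint32_t h[8], const void *inp, size_t blocks);
void blst_sha256_emit(unsigned char md[32], const uint32_t h[8]);

/* Hash a message whose length is a multiple of 64 bytes; SHA-256 padding of
 * the final partial block is intentionally left out of this sketch. */
static void sha256_full_blocks(unsigned char md[32],
                               const unsigned char *msg, size_t len)
{
    /* Standard SHA-256 initial state (FIPS 180-4). */
    uint32_t h[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };

    blst_sha256_block_data_order(h, msg, len / 64); /* compress all blocks  */
    blst_sha256_emit(md, h);                        /* serialize big-endian */
}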
+ +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$BITS=256; +$SZ=4; +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; +$reg_t="w"; +$pre="blst_"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +$code.=<<___; +.text + +.align 6 +.type .LK$BITS,%object +.LK$BITS: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK$BITS,.-.LK$BITS +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.globl ${pre}sha256_block_armv8 +.type ${pre}sha256_block_armv8,%function +.align 6 +${pre}sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. 
sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. + +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) 
{ eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +.globl ${pre}sha256_block_data_order +.type ${pre}sha256_block_data_order,%function +.align 4 +${pre}sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order +___ +} + +{ +my ($out,$inp,$len) = map("x$_",(0..2)); + +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,%function +.align 4 +${pre}sha256_emit: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[$out,#4] + lsr x4,x4,#32 + str w5,[$out,#12] + lsr x5,x5,#32 + str w6,[$out,#20] + lsr x6,x6,#32 + str w7,[$out,#28] + lsr x7,x7,#32 + str w4,[$out,#0] + str w5,[$out,#8] + str w6,[$out,#16] + str w7,[$out,#24] + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,%function +.align 4 +${pre}sha256_bcopy: +.Loop_bcopy: + ldrb w3,[$inp],#1 + sub $len,$len,#1 + strb w3,[$out],#1 + cbnz $len,.Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,%function +.align 4 +${pre}sha256_hcopy: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] + stp x4,x5,[$out] + stp x6,x7,[$out,#16] + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + 
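The blst_sha256_emit routine defined earlier in this file turns the eight 32-bit state words into the canonical digest byte order: each word is written big-endian (the rev/lsr/str sequence on little-endian AArch64). A portable C rendering of the same operation, assuming the state is an array of eight native-endian 32-bit words, is simply:

#include <stdint.h>

/* Reference behaviour of sha256_emit: store every state word big-endian,
 * producing the 32-byte SHA-256 digest. */
static void sha256_emit_ref(unsigned char md[32], const uint32_t h[8])
{
    for (int i = 0; i < 8; i++) {
        md[4*i + 0] = (unsigned char)(h[i] >> 24);
        md[4*i + 1] = (unsigned char)(h[i] >> 16);
        md[4*i + 2] = (unsigned char)(h[i] >>  8);
        md[4*i + 3] = (unsigned char)(h[i]);
    }
}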
+foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/blst_src/asm/sha256-portable-x86_64.pl b/crypto/blst_src/asm/sha256-portable-x86_64.pl new file mode 100755 index 00000000000..eca0564ebe7 --- /dev/null +++ b/crypto/blst_src/asm/sha256-portable-x86_64.pl @@ -0,0 +1,337 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# Scalar-only version with minor twist minimizing 'lea' instructions. + +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add `$SZ*$i`($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 +___ +$code.=<<___ if ($i==31); + lea `16*$SZ`($Tbl),$Tbl # round+=16 +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr 
\$$sigma1[2],$a1 + + ror \$$sigma1[0],$a2 + xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.text + +.globl $func +.type $func,\@function,3,"unwind" +.align 16 +$func: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg +.cfi_end_prologue + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0x19,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + lea $framesz+6*8(%rsp),%r11 +.cfi_def_cfa %r11,8 + mov $framesz(%rsp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbp +.cfi_restore %rbp + mov -8(%r11),%rbx +.cfi_restore %rbx +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size $func,.-$func + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/blst_src/asm/sha256-x86_64.pl b/crypto/blst_src/asm/sha256-x86_64.pl new file mode 100755 index 00000000000..22b376318fa --- /dev/null +++ b/crypto/blst_src/asm/sha256-x86_64.pl @@ -0,0 +1,789 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# This module is stripped of AVX and even scalar code paths, with +# raionale that +# +# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* +# processor, venerable Sandy Bridge; +# b) AVX2 incurs costly power transitions, which would be justifiable +# if AVX2 code was executing most of the time, which is not the +# case in the context; +# c) all comtemporary processors support SSSE3, so that nobody would +# actually use scalar code path anyway; +# +# See original module at CRYPTOGAMS for further details. 
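Since only the SHA-extension and SSSE3 code paths are emitted for x86_64, choosing between blst_sha256_block_data_order_shaext and blst_sha256_block_data_order is a run-time decision keyed on CPUID.(EAX=7,ECX=0):EBX bit 29 ("SHA"). The sketch below shows one way a caller could make that choice with GCC/Clang's <cpuid.h>; it is purely illustrative, makes no claim about how the library itself wires up dispatch, and the prototypes are assumptions inferred from the assembly's three-argument convention (state, input, block count).

#include <stddef.h>
#include <stdint.h>
#include <cpuid.h>  /* GCC/Clang CPUID helpers */

/* Assumed prototypes, inferred from the assembly. */
void blst_sha256_block_data_order(uint32_t h[8], const void *inp, size_t blocks);
void blst_sha256_block_data_order_shaext(uint32_t h[8], const void *inp, size_t blocks);

/* Illustrative dispatch: use the SHA-extension path when leaf-7 EBX bit 29
 * is set, otherwise fall back to the SSSE3 path (assumed baseline here,
 * matching the module's rationale above). */
static void sha256_blocks(uint32_t h[8], const void *inp, size_t blocks)
{
    unsigned int eax, ebx, ecx, edx;

    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1u << 29)))
        blst_sha256_block_data_order_shaext(h, inp, blocks);
    else
        blst_sha256_block_data_order(h, inp, blocks);
}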
+ +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +$code=<<___; +.text + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ + +###################################################################### +# SIMD code paths +# +{{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
+# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl ${pre}sha256_block_data_order_shaext +.hidden ${pre}sha256_block_data_order_shaext +.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" +.align 64 +${pre}sha256_block_data_order_shaext: +.cfi_startproc +___ +$code.=<<___ if ($win64); + sub \$0x58,%rsp +.cfi_adjust_cfa_offset 0x58 + movaps %xmm6,-0x58(%r11) +.cfi_offset %xmm6,-0x60 + movaps %xmm7,-0x48(%r11) +.cfi_offset %xmm7,-0x50 + movaps %xmm8,-0x38(%r11) +.cfi_offset %xmm8,-0x40 + movaps %xmm9,-0x28(%r11) +.cfi_offset %xmm9,-0x30 + movaps %xmm10,-0x18(%r11) +.cfi_offset %xmm10,-0x20 +.cfi_end_prologue +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x100-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*16-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -0x58(%r11),%xmm6 + movaps -0x48(%r11),%xmm7 + movaps -0x38(%r11),%xmm8 + movaps -0x28(%r11),%xmm9 + movaps -0x18(%r11),%xmm10 + mov %r11,%rsp +.cfi_def_cfa %r11,8 +.cfi_epilogue +___ +$code.=<<___; + ret +.cfi_endproc +.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +{ +my $Tbl = $inp; +my $_ctx="0(%rbp)"; +my $_inp="8(%rbp)"; +my $_end="16(%rbp)"; +my $framesz=4*8+$win64*16*4+8; + +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func} +.hidden ${func} +.type ${func},\@function,3,"unwind" +.align 64 +${func}: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,0(%rsp) # save ctx, 1st arg + #mov $inp,8(%rsp) # save inp, 2nd arg + mov %rdx,16(%rsp) # save end pointer, "3rd" arg +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x20(%rsp) +.cfi_offset %xmm6,-0x78 + movaps %xmm7,0x30(%rsp) +.cfi_offset %xmm7,-0x68 + movaps %xmm8,0x40(%rsp) +.cfi_offset %xmm8,-0x58 + movaps %xmm9,0x50(%rsp) +.cfi_offset %xmm9,-0x48 +___ +$code.=<<___; + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +.cfi_end_prologue + + lea -16*$SZ(%rsp),%rsp + mov $SZ*0($ctx),$A + and \$-64,%rsp # align stack + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + mov $inp,$_inp # offload $inp + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x10($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x20($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x30($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + mov $_inp,$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + lea 16*$SZ($inp),$inp + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + xorps %xmm0, %xmm0 + lea $framesz+6*8(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0, 0x00(%rsp) # scrub the stack + movaps %xmm0, 0x10(%rsp) + movaps %xmm0, 0x20(%rsp) + movaps %xmm0, 0x30(%rsp) +___ +$code.=<<___ if ($win64); + movaps 0x20(%rbp),%xmm6 + movaps 0x30(%rbp),%xmm7 + movaps 0x40(%rbp),%xmm8 + movaps 0x50(%rbp),%xmm9 +___ +$code.=<<___; + mov $framesz(%rbp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbx +.cfi_restore %rbx + mov -8(%r11),%rbp +.cfi_restore %rbp +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size ${func},.-${func} +___ +} +}}} +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/crypto/blst_src/asm/x86_64-xlate.pl b/crypto/blst_src/asm/x86_64-xlate.pl new file mode 100755 index 00000000000..62be619d9fc --- /dev/null +++ b/crypto/blst_src/asm/x86_64-xlate.pl @@ -0,0 +1,1781 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. 
Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# But on the pros, it's then prefixed with rep automatically:-) +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. [Unless function is +# tagged with additional .type tag.] For further details see SEH +# paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. + + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $dwarf=$elf; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) +{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) + { $nasm = $1 + $2*0.01; $PTR=""; } + elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) + { $masm = $1 + $2*2**-16 + $4*2**-32; } + die "no assembler found on %PATH%" if (!($nasm || $masm)); + $win64=1; + $elf=0; + $decor="\$L\$"; +} + +$dwarf=0 if($win64); + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
+ $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { + # pass through + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . ".byte 0xf3,0xc3"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". + "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... 
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) { + $self->{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + my %sifmap = ( ss=>"d", sd=>"q", # broadcast only + i32x2=>"q", f32x2=>"q", + i32x4=>"x", i64x2=>"x", i128=>"x", + f32x4=>"x", f64x2=>"x", f128=>"x", + i32x8=>"y", i64x4=>"y", + f32x8=>"y", f64x4=>"y" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl...
+ $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/g; + $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) + && ($sz=$sifmap{$1}); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . ":"; + if ($current_function->{name} eq $self->{value}) { + $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); + $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch + if ($win64 && $current_function->{abi} eq "svr4") { + my $fp = $current_function->{unwind} ?
"%r11" : "%rax"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,$fp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + my $fp = $current_function->{unwind} ? "r11" : "rax"; + $func .= " DB 243,15,30,250\n"; # endbranch + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov $fp,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? ":" : "\tPROC $current_function->{scope}"). + "\n DB 243,15,30,250"; # endbranch + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +my @xdata_seg = (".section .xdata", ".align 8"); +my @pdata_seg = (".section .pdata", ".align 4"); + +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. 
Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + # + # In addition the .cfi directives are re-purposed even for Win64 + # stack unwinding. Two more synthetic directives were added: + # + # - .cfi_end_prologue to denote point when all non-volatile + # registers are saved and stack or [chosen] frame pointer is + # stable; + # - .cfi_epilogue to denote point when all non-volatile registers + # are restored [and it even adds missing .cfi_restore-s]; + # + # Though it's not universal "miracle cure," it has its limitations. + # Most notably .cfi_cfa_expression won't start working... For more + # information see the end of this file. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); + my @cfa_stack; + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... 
+ last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... + } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + + # Following constants are defined in "x64 exception handling" at + # https://docs.microsoft.com/ and match the register sequence in + # CONTEXT structure defined in winnt.h. + my %WIN64_reg_idx = ( + "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, + "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + sub xdata { + our @dat = (); + our $len = 0; + + sub allocstack { + my $offset = shift; + + if ($offset) { + if ($offset <= 128) { + $offset = ($offset - 8) >> 3; + push @dat, [0,$offset<<4|2]; # UWOP_ALLOC_SMALL + } elsif ($offset < 0x80000) { + push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; + } + $len += $#{@dat[-1]}+1; + } + } + + # allocate stack frame + if (my $offset = -8 - $cfa_rsp) { + # but see if frame pointer is among saved registers + if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) { + $fp_off = -8 - $fp_off; + allocstack($fp_off-8); + $offset -= $fp_off; + push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL + $len += $#{@dat[-1]}+1; + } + allocstack($offset); + } + # set up frame pointer + my $fp_info = 0; + if ($cfa_reg ne "%rsp") { + my $offset = $cfa_off - $cfa_rsp; + ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset"; + $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg}; + push @dat, [0,3]; # UWOP_SET_FPREG + $len += $#{@dat[-1]}+1; + } + # save registers + foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } + keys(%saved_regs)) { + next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); + my $offset = $saved_regs{$key} - $cfa_rsp; + if ($key =~ /%xmm([0-9]+)/) { + if ($offset < 0x100000) { + push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; + } else { + push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; + } + } else { + if ($offset < 0x80000) { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, + unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, + 
unpack("C4",pack("V",$offset))]; + } + } + $len += $#{@dat[-1]}+1; + } + + my @ret; + # generate 4-byte descriptor + push @ret, ".byte 1,0,".($len/2).",$fp_info"; + $len += 4; + # pad to 8*n + unshift @dat, [(0)x((-$len)&7)] if ($len&7); + # emit data + while(defined(my $row = pop @dat)) { + push @ret, ".byte ". join(",", + map { sprintf "0x%02x",$_ } @{$row}); + } + + return @ret; + } + sub startproc { + return if ($cfa_rsp == -8); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); + %saved_regs = (); + return "startproc"; + } + sub endproc { + return if ($cfa_rsp == 0); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); + %saved_regs = (); + return "endproc"; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { $dir = startproc(); last; }; + /endproc/ && do { $dir = endproc(); + # .cfi_remember_state directives that are not + # matched with .cfi_restore_state are + # unnecessary. + die "unpaired .cfi_remember_state" if (@cfa_stack); + last; + }; + /def_cfa_register/ + && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); + $cfa_reg = $$line; + last; + }; + /def_cfa_offset/ + && do { $cfa_off = -1*eval($$line); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { my $val = 1*eval($$line); + $cfa_off -= $val; + if ($cfa_reg eq "%rsp") { + $cfa_rsp -= $val; + } + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_off = -1*eval($2); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $saved_regs{$$line} = $cfa_rsp; + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + delete $saved_regs{$$line}; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, + [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; + last; + }; + /restore_state/ + && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) + = @{pop @cfa_stack}; + last; + }; + /offset/ && do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) { + $saved_regs{$1} = 1*eval($2); + $dir = undef if ($1 =~ /%xmm/); + } + last; + }; + /restore/ && do { delete $saved_regs{$$line}; last; }; + /end_prologue/ + && do { $dir = undef; + $self->{win64} = ".endprolog"; + last; + }; + /epilogue/ && do { $dir = undef; + $self->{win64} = ".epilogue"; + $self->{value} = join("\n", + map { ".cfi_restore\t$_" } + sort keys(%saved_regs)); + %saved_regs = (); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return $self->{value} if ($dwarf); + + if ($win64 and $current_function->{unwind} + and my $ret = $self->{win64}) { + my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? 
($', $cfa_off) + : ("rsp", $cfa_rsp); + my $fname = $current_function->{name}; + + if ($ret eq ".endprolog") { + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + + push @pdata_seg, + ".rva .LSEH_begin_${fname}", + ".rva .LSEH_body_${fname}", + ".rva .LSEH_info_${fname}_prologue",""; + push @xdata_seg, + ".LSEH_info_${fname}_prologue:", + ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP + ".byte 0,0x74,1,0", # %rdi at 8(%rsp) + ".byte 0,0x64,2,0", # %rsi at 16(%rsp) + ".byte 0,0x03", # set frame pointer + ".byte 0,0" # padding + ; + push @pdata_seg, + ".rva .LSEH_body_${fname}", + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_info_${fname}_body",""; + push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); + $ret = "${decor}SEH_body_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + } elsif ($ret eq ".epilogue") { + %saved_regs = (); + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + $cfa_rsp = $cfa_off; + + push @pdata_seg, + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_end_${fname}", + ".rva .LSEH_info_${fname}_epilogue",""; + push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; + $ret = "${decor}SEH_epilogue_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + if ($gas) { + $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; + $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; + } else { + $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; + $ret .= " ;WIN64 epilogue\n"; + $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; + } + } + return $ret; + } + return; + } +} +{ package directive; # pick up directives, which start with . + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + $current_function->{unwind} = $unwind; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + $self->{value} = ""; # swallow extern + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . 
+ "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif ($dir =~ /\.size/) { + $self->{value} = "" if (!$elf); + if ($dwarf and my $endproc = cfi_directive::endproc()) { + $self->{value} = ".cfi_$endproc\n$self->{value}"; + } elsif (!$elf && defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . (log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v="section .text code align=64\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v="section .data data align=8\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + if ($nasm) { + $v="section $$line"; + if ($$line=~/\.([px])data/) { + $v.=" rdata align="; + $v.=$1 eq "p"? 4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $v.=" rdata align=8"; + } + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([px])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="DB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="DB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... + +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + 
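+	# Worked example (illustrative): for the operands "%xmm1,%xmm0" this
+	# closure returns 0x66,0x0f,0x38,0x00,0xc1, i.e. the 66 0F 38 00 /r
+	# encoding of pshufb with ModR/M 0xc1 (reg=xmm0, r/m=xmm1), which
+	# process() below then prints as a raw .byte/DB sequence.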
rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... + +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... +# However, it should not be used in functions' prologues explicitly, as +# it's added automatically [and in the right spot]. Which leaves only +# non-function indirect branch targets, such as in a case-like dispatch +# table, as application area. + +my $endbr64 = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +if ($nasm) { + print <<___; +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} + +sub process { + my $line = shift; + + $line =~ s|\R$||; # Better chomp + + $line =~ s|[#!].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... 
and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +while(<>) { process($_); } + +map { process($_) } @pdata_seg if ($win64); +map { process($_) } @xdata_seg if ($win64); + +# platform-specific epilogue +if ($masm) { + print "\n$current_segment\tENDS\n" if ($current_segment); + print "END\n"; +} elsif ($elf) { + # -fcf-protection segment, snatched from compiler -S output + my $align = ($flavour =~ /elf32/) ? 4 : 8; + print <<___; + +.section .note.GNU-stack,"",\@progbits +.section .note.gnu.property,"a",\@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align $align +2: +___ +} + +close STDOUT; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. +# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... 
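+# For instance, with a hypothetical call foo(a,b,c,d,e,f): Unix passes a..f in
+# %rdi,%rsi,%rdx,%rcx,%r8,%r9 and would place a 7th argument at 8(%rsp), while
+# Win64 passes a..d in %rcx,%rdx,%r8,%r9, leaves the 32-byte scratch area at
+# 8(%rsp)-39(%rsp) to the callee, and spills e and f to 40(%rsp) and 48(%rsp).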
+# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. 
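The offsets the handler above pulls from the recovered stack pointer follow directly from the prologue convention: %r15, %rbx and %rbp are pushed, in that order, right after the original %rsp is copied to %rax, and the Win64 "gear" prologue has already stashed %rdi and %rsi in the home slots at 8(%rsp) and 16(%rsp). A minimal sketch of that layout, expressed as a C view relative to the entry-time stack pointer (the struct name and this rendering are illustrative, not part of blst):

    #include <stdint.h>

    /* Layout relative to the original rsp recovered via rsp = rsp[0]. */
    typedef struct {
        uint64_t saved_rbp;   /* rsp[-3] : pushq %rbp                        */
        uint64_t saved_rbx;   /* rsp[-2] : pushq %rbx                        */
        uint64_t saved_r15;   /* rsp[-1] : pushq %r15                        */
        uint64_t return_addr; /* rsp[ 0] : caller's return address           */
        uint64_t home_rdi;    /* rsp[ 1] : %rdi saved by the "gear" prologue */
        uint64_t home_rsi;    /* rsp[ 2] : %rsi saved by the "gear" prologue */
    } entry_stack_view;

This is why the handler restores Rbp/Rbx/R15 from rsp[-3..-1] and Rdi/Rsi from rsp[1..2] once it has dereferenced the saved stack-pointer copy.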
In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. 
Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). +# +######################################################################## +# As of May 2020 an alternative approach that works with both exceptions +# and debugging/profiling was implemented by re-purposing DWARF .cfi +# annotations even for Win64 unwind tables' generation. Unfortunately, +# but not really unexpectedly, it imposes additional limitations on +# coding style. Probably most significant limitation is that frame +# pointer has to be at 16*n distance from stack pointer at the exit +# from prologue. But first things first. There are two additional +# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, +# that need to be added to all functions marked with additional .type +# tag (see example below). There are "do's and don'ts" for prologue +# and epilogue. It shouldn't come as surprise that in prologue one may +# not modify non-volatile registers, but one may not modify %r11 either. +# This is because it's used as temporary frame pointer(*). There is one +# exception to this rule, and it's setting up frame pointer that is +# non-volatile or %r11. But it must be last instruction in the prologue. +# Constraints for epilogue, or rather on its boundary, depend on whether +# the frame is fixed- or variable-length. In fixed-frame subroutine +# stack pointer has to be restored in the last instruction prior the +# .cfi_epilogue directive. If it's variable-frame subroutine, and a +# non-volatile register was used as frame pointer, then last instruction +# prior the directive has to restore its original value. This means that +# final stack pointer adjustment would have to be pushed past the +# directive. Normally this would render the epilogue non-unwindable, so +# special care has to be taken. To resolve the dilemma, copy frame +# pointer to a volatile register in advance. To give an example: +# +# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! +# rbp_as_frame_pointer: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# mov %rsp,%rbp # last instruction in prologue +# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 +# .cfi_end_prologue +# sub \$40,%rsp +# and \$-64,%rsp +# ... +# mov %rbp,%r11 +# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 +# mov 0(%rbp),%rbx +# mov 8(%rbp),%rbp # last instruction prior epilogue +# .cfi_epilogue # may not change %r11 in epilogue +# lea 16(%r11),%rsp +# ret +# .cfi_endproc +# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer +# +# To give an example of fixed-frame subroutine for reference: +# +# .type fixed_frame,\@function,3,"unwind" # mind extra tag! +# fixed_frame: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# sub \$40,%rsp +# .cfi_adjust_cfa_offset 40 +# .cfi_end_prologue +# ... +# mov 40(%rsp),%rbx +# mov 48(%rsp),%rbp +# lea 56(%rsp),%rsp +# .cfi_adjust_cfa_offset -56 +# .cfi_epilogue +# ret +# .cfi_endproc +# .size fixed_frame,.-fixed_frame +# +# As for epilogue itself, one can only work on non-volatile registers. +# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. +# +# On a final note, mixing old-style and modernized subroutines in the +# same file takes some trickery. Ones of the new kind have to appear +# after old-style ones. 
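The numbers in the fixed_frame example are worth checking once, since the closing lea/.cfi_adjust_cfa_offset pair must undo every prologue adjustment exactly: two 8-byte pushes plus the 40-byte sub account for the 56 bytes released by lea 56(%rsp),%rsp, and the two pushed registers sit just above the 40-byte scratch area. A small compile-time check of that arithmetic (illustrative only, not part of blst):

    /* Frame accounting implied by the fixed_frame example above. */
    enum { PUSH_RBP = 8, PUSH_RBX = 8, SUB_RSP = 40 };

    _Static_assert(SUB_RSP + PUSH_RBX + PUSH_RBP == 56,
                   "undone by lea 56(%rsp),%rsp / .cfi_adjust_cfa_offset -56");
    _Static_assert(SUB_RSP == 40,
                   "%rbx is restored from 40(%rsp), just above the scratch area");
    _Static_assert(SUB_RSP + PUSH_RBX == 48,
                   "%rbp is restored from 48(%rsp)");

As noted above, subroutines written in this modernized style have to come after the old-style ones in the same file.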
This has everything to do with the fact that +# entries in the .pdata segment have to appear in strictly same order +# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION +# structures get mechanically appended to whatever existing .pdata. +# +# (*) Just in case, why %r11 and not %rax. This has everything to do +# with the way UNWIND_INFO is, one just can't designate %rax as +# frame pointer. diff --git a/crypto/blst_src/blst_t.hpp b/crypto/blst_src/blst_t.hpp new file mode 100644 index 00000000000..1b150da30ce --- /dev/null +++ b/crypto/blst_src/blst_t.hpp @@ -0,0 +1,538 @@ +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef __BLST_T_HPP__ +#define __BLST_T_HPP__ + +/* + * These templates, blst_384_t and blst_256_t, allow to instantiate slim + * C++ shims to blst assembly with arbitrary moduli. Well, not literally + * arbitrary, as there are limitations. Most notably blst_384_t can not + * actually accommodate 384-bit moduli, only 383 and narrower. This is + * because of ct_inverse_mod_383's limitation. Though if you abstain + * from the reciprocal() method, even 384-bit modulus would work. As for + * blst_256_t, modulus has to be not larger than 2^256-2^192-1. + */ + +#ifdef __GNUC__ +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +extern "C" { +#include "vect.h" +} +#include "bytes.h" + +#undef launder // avoid conflict with C++ >=17 + +#ifdef __GNUC__ +# pragma GCC diagnostic pop +#endif + +static inline void vec_left_align(limb_t *out, const limb_t *inp, size_t n) +{ + const unsigned int nbits = sizeof(inp[0])*8; + unsigned int align = 0; + limb_t top = inp[n-1]; + + if (top) { + while ((top >> (nbits-1)) == 0) + top <<= 1, align++; + } + if (align) { + while (--n) { + limb_t next = inp[n-1]; + out[n] = top | next >> (nbits-align); + top = next << align; + } + out[0] = top; + } else { + for (size_t i = 0; i < n-1; i++) + out[i] = inp[i]; + out[n-1] = top; + } +} + +constexpr static inline size_t vec_nbits(const limb_t *inp, size_t n) +{ + const unsigned int nbits = sizeof(inp[0])*8; + size_t align = 0; + limb_t top = inp[n-1]; + + while ((top >> (nbits-1)) == 0) + top <<= 1, align++; + + return n*nbits - align; +} + +template +class blst_384_t { +private: + vec384 val; + + inline operator const limb_t*() const { return val; } + inline operator limb_t*() { return val; } + inline limb_t& operator[](size_t i) { return val[i]; } + inline const limb_t& operator[](size_t i) const { return val[i]; } + +public: + static const size_t n = sizeof(vec384)/sizeof(limb_t); + static const size_t nbits = vec_nbits(MOD, n); + typedef byte pow_t[384/8]; + + inline blst_384_t() {} + inline blst_384_t(const vec384 p, bool align = false) + { + if (align) + vec_left_align(val, p, n); + else + vec_copy(val, p, sizeof(val)); + } + inline blst_384_t(uint64_t a) + { + vec_zero(val, sizeof(val)); + val[0] = a; + if (a) to(); + } + inline blst_384_t(int a) : blst_384_t((uint64_t)a) {} + + inline void to_scalar(pow_t& scalar) const + { + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { + from_mont_384((limb_t *)scalar, val, MOD, M0); + } else { + vec384 out; + from_mont_384(out, val, MOD, M0); + le_bytes_from_limbs(scalar, out, sizeof(pow_t)); + vec_zero(out, sizeof(out)); + } + } + + static inline const blst_384_t& one() + { return *reinterpret_cast(ONE); } + + inline 
blst_384_t& to() + { mul_mont_384(val, RR, val, MOD, M0); return *this; } + inline blst_384_t& from() + { from_mont_384(val, val, MOD, M0); return *this; } + + inline void store(limb_t *p) const + { vec_copy(p, val, sizeof(val)); } + + inline blst_384_t& operator+=(const blst_384_t& b) + { add_mod_384(val, val, b, MOD); return *this; } + friend inline blst_384_t operator+(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + add_mod_384(ret, a, b, MOD); + return ret; + } + + inline blst_384_t& operator<<=(unsigned l) + { lshift_mod_384(val, val, l, MOD); return *this; } + friend inline blst_384_t operator<<(const blst_384_t& a, unsigned l) + { + blst_384_t ret; + lshift_mod_384(ret, a, l, MOD); + return ret; + } + + inline blst_384_t& operator>>=(unsigned r) + { rshift_mod_384(val, val, r, MOD); return *this; } + friend inline blst_384_t operator>>(blst_384_t a, unsigned r) + { + blst_384_t ret; + rshift_mod_384(ret, a, r, MOD); + return ret; + } + + inline blst_384_t& operator-=(const blst_384_t& b) + { sub_mod_384(val, val, b, MOD); return *this; } + friend inline blst_384_t operator-(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + sub_mod_384(ret, a, b, MOD); + return ret; + } + + inline blst_384_t& cneg(bool flag) + { cneg_mod_384(val, val, flag, MOD); return *this; } + friend inline blst_384_t cneg(const blst_384_t& a, bool flag) + { + blst_384_t ret; + cneg_mod_384(ret, a, flag, MOD); + return ret; + } + friend inline blst_384_t operator-(const blst_384_t& a) + { + blst_384_t ret; + cneg_mod_384(ret, a, true, MOD); + return ret; + } + + inline blst_384_t& operator*=(const blst_384_t& a) + { + if (this == &a) sqr_mont_384(val, val, MOD, M0); + else mul_mont_384(val, val, a, MOD, M0); + return *this; + } + friend inline blst_384_t operator*(const blst_384_t& a, const blst_384_t& b) + { + blst_384_t ret; + if (&a == &b) sqr_mont_384(ret, a, MOD, M0); + else mul_mont_384(ret, a, b, MOD, M0); + return ret; + } + + // simplified exponentiation, but mind the ^ operator's precedence! 
+ friend inline blst_384_t operator^(const blst_384_t& a, unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + blst_384_t ret; + sqr_mont_384(ret, a, MOD, M0); + return ret; + } else { + blst_384_t ret; + sqr_mont_384(ret, a, MOD, M0); + for (p -= 2; p--;) + mul_mont_384(ret, ret, a, MOD, M0); + return ret; + } + } + inline blst_384_t& operator^=(unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + sqr_mont_384(val, val, MOD, M0); + return *this; + } + return *this = *this^p; + } + inline blst_384_t operator()(unsigned p) + { return *this^p; } + friend inline blst_384_t sqr(const blst_384_t& a) + { return a^2; } + + inline bool is_zero() const + { return vec_is_zero(val, sizeof(val)); } + + inline void zero() + { vec_zero(val, sizeof(val)); } + + blst_384_t reciprocal() const + { + static const blst_384_t MODx{MOD, true}; + static const blst_384_t RRx4 = *reinterpret_cast(RR)<<2; + union { vec768 x; vec384 r[2]; } temp; + + ct_inverse_mod_383(temp.x, val, MOD, MODx); + redc_mont_384(temp.r[0], temp.x, MOD, M0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, MOD, M0); + + return *reinterpret_cast(temp.r[0]); + } + friend inline blst_384_t operator/(unsigned one, const blst_384_t& a) + { + if (one == 1) + return a.reciprocal(); + abort(); + } + friend inline blst_384_t operator/(const blst_384_t& a, const blst_384_t& b) + { return a * b.reciprocal(); } + inline blst_384_t& operator/=(const blst_384_t& a) + { return *this *= a.reciprocal(); } + +#ifndef NDEBUG + inline blst_384_t(const char *hexascii) + { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } + + friend inline bool operator==(const blst_384_t& a, const blst_384_t& b) + { return vec_is_equal(a, b, sizeof(vec384)); } + friend inline bool operator!=(const blst_384_t& a, const blst_384_t& b) + { return !vec_is_equal(a, b, sizeof(vec384)); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend std::ostream& operator<<(std::ostream& os, const blst_384_t& obj) + { + unsigned char be[sizeof(obj)]; + char buf[2+2*sizeof(obj)+1], *str = buf; + + be_bytes_from_limbs(be, blst_384_t{obj}.from(), sizeof(obj)); + + *str++ = '0', *str++ = 'x'; + for (size_t i = 0; i < sizeof(obj); i++) + *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); + *str = '\0'; + + return os << buf; + } +# endif +#endif +}; + +template +class blst_256_t { + vec256 val; + + inline operator const limb_t*() const { return val; } + inline operator limb_t*() { return val; } + inline limb_t& operator[](size_t i) { return val[i]; } + inline const limb_t& operator[](size_t i) const { return val[i]; } + +public: + static const size_t n = sizeof(vec256)/sizeof(limb_t); + static const size_t nbits = vec_nbits(MOD, n); + typedef byte pow_t[256/8]; + + inline blst_256_t() {} + inline blst_256_t(const vec256 p, bool align = false) + { + if (align) + vec_left_align(val, p, n); + else + vec_copy(val, p, sizeof(val)); + } + inline blst_256_t(uint64_t a) + { + vec_zero(val, sizeof(val)); + val[0] = a; + if (a) to(); + } + inline blst_256_t(int a) : blst_256_t((uint64_t)a) {} + + inline void to_scalar(pow_t& scalar) const + { + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { + from_mont_256((limb_t *)scalar, val, MOD, M0); + } else { + vec256 out; + from_mont_256(out, val, MOD, M0); + le_bytes_from_limbs(scalar, out, sizeof(pow_t)); + vec_zero(out, sizeof(out)); + } + } + + static inline const blst_256_t& one() + { return 
*reinterpret_cast(ONE); } + + inline blst_256_t& to() + { mul_mont_sparse_256(val, val, RR, MOD, M0); return *this; } + inline blst_256_t& to(const uint64_t a[2*n]) + { + mul_mont_sparse_256(val, RR, (const limb_t*)(a + n), MOD, M0); + vec256 lo{0}; + add_mod_256(lo, lo, (const limb_t*)a, MOD); + add_mod_256(val, val, lo, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + + return *this; + } + blst_256_t& to(const unsigned char* bytes, size_t n, bool le = false) + { + vec_zero(val, sizeof(val)); + + vec256 digit, zero{0}; + size_t rem = (n - 1) % 32 + 1; + n -= rem; + + if (le) { + limbs_from_le_bytes(val, bytes += n, rem); + mul_mont_sparse_256(val, RR, val, MOD, M0); + while (n) { + limbs_from_le_bytes(digit, bytes -= 32, 32); + add_mod_256(digit, digit, zero, MOD); + add_mod_256(val, val, digit, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + n -= 32; + } + } else { + limbs_from_be_bytes(val, bytes, rem); + mul_mont_sparse_256(val, RR, val, MOD, M0); + bytes += rem; + while (n) { + limbs_from_be_bytes(digit, bytes, 32); + add_mod_256(digit, digit, zero, MOD); + add_mod_256(val, val, digit, MOD); + mul_mont_sparse_256(val, RR, val, MOD, M0); + bytes += 32; + n -= 32; + } + } + + return *this; + } + + inline blst_256_t& from() + { from_mont_256(val, val, MOD, M0); return *this; } + + inline void store(limb_t *p) const + { vec_copy(p, val, sizeof(val)); } + + inline blst_256_t& operator+=(const blst_256_t& b) + { add_mod_256(val, val, b, MOD); return *this; } + friend inline blst_256_t operator+(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + add_mod_256(ret, a, b, MOD); + return ret; + } + + inline blst_256_t& operator<<=(unsigned l) + { lshift_mod_256(val, val, l, MOD); return *this; } + friend inline blst_256_t operator<<(const blst_256_t& a, unsigned l) + { + blst_256_t ret; + lshift_mod_256(ret, a, l, MOD); + return ret; + } + + inline blst_256_t& operator>>=(unsigned r) + { lshift_mod_256(val, val, r, MOD); return *this; } + friend inline blst_256_t operator>>(blst_256_t a, unsigned r) + { + blst_256_t ret; + lshift_mod_256(ret, a, r, MOD); + return ret; + } + + inline blst_256_t& operator-=(const blst_256_t& b) + { sub_mod_256(val, val, b, MOD); return *this; } + friend inline blst_256_t operator-(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + sub_mod_256(ret, a, b, MOD); + return ret; + } + + inline blst_256_t& cneg(bool flag) + { cneg_mod_256(val, val, flag, MOD); return *this; } + friend inline blst_256_t cneg(const blst_256_t& a, bool flag) + { + blst_256_t ret; + cneg_mod_256(ret, a, flag, MOD); + return ret; + } + friend inline blst_256_t operator-(const blst_256_t& a) + { + blst_256_t ret; + cneg_mod_256(ret, a, true, MOD); + return ret; + } + + inline blst_256_t& operator*=(const blst_256_t& a) + { + if (this == &a) sqr_mont_sparse_256(val, val, MOD, M0); + else mul_mont_sparse_256(val, val, a, MOD, M0); + return *this; + } + friend inline blst_256_t operator*(const blst_256_t& a, const blst_256_t& b) + { + blst_256_t ret; + if (&a == &b) sqr_mont_sparse_256(ret, a, MOD, M0); + else mul_mont_sparse_256(ret, a, b, MOD, M0); + return ret; + } + + // simplified exponentiation, but mind the ^ operator's precedence! 
+ friend inline blst_256_t operator^(const blst_256_t& a, unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + blst_256_t ret; + sqr_mont_sparse_256(ret, a, MOD, M0); + return ret; + } else { + blst_256_t ret; + sqr_mont_sparse_256(ret, a, MOD, M0); + for (p -= 2; p--;) + mul_mont_sparse_256(ret, ret, a, MOD, M0); + return ret; + } + } + inline blst_256_t& operator^=(unsigned p) + { + if (p < 2) { + abort(); + } else if (p == 2) { + sqr_mont_sparse_256(val, val, MOD, M0); + return *this; + } + return *this = *this^p; + } + inline blst_256_t operator()(unsigned p) + { return *this^p; } + friend inline blst_256_t sqr(const blst_256_t& a) + { return a^2; } + + inline bool is_zero() const + { return vec_is_zero(val, sizeof(val)); } + + inline void zero() + { vec_zero(val, sizeof(val)); } + + blst_256_t reciprocal() const + { + static const blst_256_t MODx{MOD, true}; + union { vec512 x; vec256 r[2]; } temp; + + ct_inverse_mod_256(temp.x, val, MOD, MODx); + redc_mont_256(temp.r[0], temp.x, MOD, M0); + mul_mont_sparse_256(temp.r[0], temp.r[0], RR, MOD, M0); + + return *reinterpret_cast(temp.r[0]); + } + friend inline blst_256_t operator/(int one, const blst_256_t& a) + { + if (one == 1) + return a.reciprocal(); + abort(); + } + friend inline blst_256_t operator/(const blst_256_t& a, const blst_256_t& b) + { return a * b.reciprocal(); } + inline blst_256_t& operator/=(const blst_256_t& a) + { return *this *= a.reciprocal(); } + +#ifndef NDEBUG + inline blst_256_t(const char *hexascii) + { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } + + friend inline bool operator==(const blst_256_t& a, const blst_256_t& b) + { return vec_is_equal(a, b, sizeof(vec256)); } + friend inline bool operator!=(const blst_256_t& a, const blst_256_t& b) + { return !vec_is_equal(a, b, sizeof(vec256)); } + +# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard + friend std::ostream& operator<<(std::ostream& os, const blst_256_t& obj) + { + unsigned char be[sizeof(obj)]; + char buf[2+2*sizeof(obj)+1], *str=buf; + + be_bytes_from_limbs(be, blst_256_t{obj}.from(), sizeof(obj)); + + *str++ = '0', *str++ = 'x'; + for (size_t i = 0; i < sizeof(obj); i++) + *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); + *str = '\0'; + + return os << buf; + } +# endif +#endif +}; +#endif diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_src/build/assembly.S new file mode 100644 index 00000000000..a1a7c5416e0 --- /dev/null +++ b/crypto/blst_src/build/assembly.S @@ -0,0 +1,123 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# if defined(__BLST_PORTABLE__) +# include "elf/sha256-portable-x86_64.s" +# else +# include "elf/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/ctx_inverse_mod_384-x86_64.s" +# else +# include "elf/ctq_inverse_mod_384-x86_64.s" +# endif +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# else +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# include "elf/ct_inverse_mod_256-x86_64.s" +# include "elf/div3w-x86_64.s" +# include "elf/ct_is_square_mod_384-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# if 
defined(__BLST_PORTABLE__) +# include "coff/sha256-portable-x86_64.s" +# else +# include "coff/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/ctx_inverse_mod_384-x86_64.s" +# else +# include "coff/ctq_inverse_mod_384-x86_64.s" +# endif +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# else +# include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# include "coff/ct_inverse_mod_256-x86_64.s" +# include "coff/div3w-x86_64.s" +# include "coff/ct_is_square_mod_384-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/ctx_inverse_mod_384-x86_64.s" +# else +# include "mach-o/ctq_inverse_mod_384-x86_64.s" +# endif +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# else +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# include "mach-o/ct_inverse_mod_256-x86_64.s" +# include "mach-o/div3w-x86_64.s" +# include "mach-o/ct_is_square_mod_384-x86_64.s" +# endif +#elif defined(__aarch64__) +# if defined(__ELF__) +# include "elf/sha256-armv8.S" +# include "elf/ct_inverse_mod_384-armv8.S" +# include "elf/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "elf/mul_mont_384-armv8.S" +# include "elf/mul_mont_256-armv8.S" +# include "elf/add_mod_256-armv8.S" +# include "elf/ct_inverse_mod_256-armv8.S" +# include "elf/div3w-armv8.S" +# include "elf/ct_is_square_mod_384-armv8.S" +# elif defined(_WIN64) +# include "coff/sha256-armv8.S" +# include "coff/ct_inverse_mod_384-armv8.S" +# include "coff/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "coff/mul_mont_384-armv8.S" +# include "coff/mul_mont_256-armv8.S" +# include "coff/add_mod_256-armv8.S" +# include "coff/ct_inverse_mod_256-armv8.S" +# include "coff/div3w-armv8.S" +# include "coff/ct_is_square_mod_384-armv8.S" +# elif defined(__APPLE__) +# include "mach-o/sha256-armv8.S" +# include "mach-o/ct_inverse_mod_384-armv8.S" +# include "mach-o/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "mach-o/mul_mont_384-armv8.S" +# include "mach-o/mul_mont_256-armv8.S" +# include "mach-o/add_mod_256-armv8.S" +# include "mach-o/ct_inverse_mod_256-armv8.S" +# include "mach-o/div3w-armv8.S" +# include "mach-o/ct_is_square_mod_384-armv8.S" +# endif +#elif defined(__BLST_NO_ASM__) || \ + (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) +/* inaccurate way to detect a 32-bit processor, but it's close enough */ +#else +# error "unsupported platform" +#endif diff --git a/crypto/blst_src/build/bindings_trim.pl b/crypto/blst_src/build/bindings_trim.pl new file mode 100755 index 00000000000..90f914578d9 --- 
/dev/null +++ b/crypto/blst_src/build/bindings_trim.pl @@ -0,0 +1,37 @@ +#!/usr/bin/env perl + +# read whole file +while(<>) { push @file, $_; } + +# traverse and remove auto-generated PartialEq for chosen types +for (my $i = 0; $i <= $#file; $i++) { + if (@file[$i] =~ m/struct\s+blst_p[12]/) { + @file[$i-1] =~ s/,\s*PartialEq//; + } elsif (@file[$i] =~ m/struct\s+blst_fp12/) { + @file[$i-1] =~ s/,\s*(?:Default|PartialEq)//g; + } elsif (@file[$i] =~ m/struct\s+(blst_pairing|blst_uniq)/) { + @file[$i-1] =~ s/,\s*(?:Copy|Clone|Eq|PartialEq)//g; + } elsif (@file[$i] =~ m/struct\s+blst_scalar/) { + @file[$i-1] =~ s/,\s*Copy//; + @file[$i-1] =~ s/\)/, Zeroize\)/; + splice @file, $i, 0, "#[zeroize(drop)]\n"; $i++; + } elsif (@file[$i] =~ m/assert_eq!\($/) { + @file[++$i] =~ s/unsafe\s*\{\s*&\(\*\(::std::ptr::null::<(\w+)>\(\)\)\)\.(\w+).*\}/offsetof!($1, $2)/; + } +} + +print << '___'; +#[cfg(test)] +macro_rules! offsetof { + ($type:ty, $field:tt) => { + { + let v = <$type>::default(); + (&v.$field as *const _ as usize) - (&v as *const _ as usize) + } + }; +} +___ +# print the file +print @file; + +close STDOUT; diff --git a/crypto/blst_src/build/coff/add_mod_256-armv8.S b/crypto/blst_src/build/coff/add_mod_256-armv8.S new file mode 100644 index 00000000000..27b64ef4ca4 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_256-armv8.S @@ -0,0 +1,397 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; +.type 32; +.endef +.p2align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl lshift_mod_256 + +.def lshift_mod_256; +.type 32; +.endef +.p2align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl rshift_mod_256 + +.def rshift_mod_256; +.type 32; +.endef +.p2align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne 
+ csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl cneg_mod_256 + +.def cneg_mod_256; +.type 32; +.endef +.p2align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret + + +.globl sub_mod_256 + +.def sub_mod_256; +.type 32; +.endef +.p2align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl check_mod_256 + +.def check_mod_256; +.type 32; +.endef +.p2align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; +.type 32; +.endef +.p2align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; +.type 32; +.endef +.p2align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/crypto/blst_src/build/coff/add_mod_256-x86_64.s b/crypto/blst_src/build/coff/add_mod_256-x86_64.s new file mode 100644 index 00000000000..f88e6189ca5 --- 
/dev/null +++ b/crypto/blst_src/build/coff/add_mod_256-x86_64.s @@ -0,0 +1,911 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_256: + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_by_3_mod_256: + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 + + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_256: + +.def __lshift_mod_256; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 + + + +.globl lshift_mod_256 + +.def lshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_lshift_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_256: + + +.globl rshift_mod_256 + +.def rshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_rshift_mod_256: + + + movq 
0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_256: + + +.globl cneg_mod_256 + +.def cneg_mod_256; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_cneg_mod_256: + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_256: + + +.globl sub_mod_256 + +.def sub_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_256: + + +.globl check_mod_256 + +.def check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 
+ sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax +.LSEH_epilogue_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_check_mod_256: + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_n_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_n_check_mod_256: + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_n_check_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_n_check_mod_256: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_256 +.rva .LSEH_body_add_mod_256 +.rva .LSEH_info_add_mod_256_prologue + +.rva .LSEH_body_add_mod_256 +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_info_add_mod_256_body + +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_end_add_mod_256 +.rva .LSEH_info_add_mod_256_epilogue + +.rva .LSEH_begin_mul_by_3_mod_256 +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_prologue + +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_body + +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_end_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_epilogue + +.rva .LSEH_begin_lshift_mod_256 +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_prologue + +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_body + +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_end_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_epilogue + +.rva 
.LSEH_begin_rshift_mod_256 +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_prologue + +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_body + +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_end_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_epilogue + +.rva .LSEH_begin_cneg_mod_256 +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_prologue + +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_body + +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_end_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_epilogue + +.rva .LSEH_begin_sub_mod_256 +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_info_sub_mod_256_prologue + +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_info_sub_mod_256_body + +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_end_sub_mod_256 +.rva .LSEH_info_sub_mod_256_epilogue + +.rva .LSEH_epilogue_check_mod_256 +.rva .LSEH_end_check_mod_256 +.rva .LSEH_info_check_mod_256_epilogue + +.rva .LSEH_begin_add_n_check_mod_256 +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_prologue + +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_body + +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_end_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_epilogue + +.rva .LSEH_begin_sub_n_check_mod_256 +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_prologue + +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_body + +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_end_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_add_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_3_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_lshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_rshift_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 
+.LSEH_info_rshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_cneg_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sub_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_add_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sub_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/add_mod_384-armv8.S b/crypto/blst_src/build/coff/add_mod_384-armv8.S new file mode 100644 index 00000000000..2eff0677f54 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384-armv8.S @@ -0,0 +1,1056 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; +.type 32; +.endef +.p2align 5 +add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl add_mod_384x + +.def add_mod_384x; +.type 32; +.endef +.p2align 5 +add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl rshift_mod_384 + +.def rshift_mod_384; +.type 32; +.endef +.p2align 5 +rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __rshift_mod_384; +.type 32; +.endef +.p2align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; +.type 32; +.endef +.p2align 5 +div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl lshift_mod_384 + +.def lshift_mod_384; +.type 32; +.endef +.p2align 5 +lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __lshift_mod_384; +.type 32; +.endef +.p2align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl cneg_mod_384 + +.def cneg_mod_384; +.type 32; +.endef +.p2align 5 +cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sub_mod_384 + +.def sub_mod_384; +.type 32; +.endef +.p2align 5 +sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl sub_mod_384x + +.def sub_mod_384x; +.type 32; +.endef +.p2align 5 +sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl vec_select_32 + +.def vec_select_32; +.type 32; +.endef +.p2align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_48 + +.def vec_select_48; +.type 32; +.endef +.p2align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_96 + +.def vec_select_96; +.type 32; +.endef +.p2align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_192 + +.def vec_select_192; +.type 32; +.endef +.p2align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_144 + +.def vec_select_144; +.type 32; +.endef +.p2align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_288 + +.def vec_select_288; +.type 32; +.endef +.p2align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + 
bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_prefetch + +.def vec_prefetch; +.type 32; +.endef +.p2align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; +.type 32; +.endef +.p2align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; +.type 32; +.endef +.p2align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/crypto/blst_src/build/coff/add_mod_384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384-x86_64.s new file mode 100644 index 00000000000..d1c7ad6e689 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384-x86_64.s @@ -0,0 +1,2481 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384: + + + call __add_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + 
.byte 0xf3,0xc3 + +.LSEH_end_add_mod_384: + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x + +.def add_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_add_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x: + + +.globl rshift_mod_384 + +.def rshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_rshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_384: + +.def __rshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq 
$1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; .scl 2; .type 32; .endef +.p2align 5 +div_by_2_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_div_by_2_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_div_by_2_mod_384: + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_div_by_2_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_div_by_2_mod_384: + + +.globl lshift_mod_384 + +.def lshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_lshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_384: + +.def __lshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 + + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384: + 
movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384: + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mul_by_8_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384: + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384x: + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_8_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + 
call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384x: + + +.globl cneg_mod_384 + +.def cneg_mod_384; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdx + +.LSEH_body_cneg_mod_384: + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_384: + + +.globl sub_mod_384 + +.def sub_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384: + + + call __sub_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384: + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + 
sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sub_mod_384x + +.def sub_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_sub_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x: +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_1_plus_i_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_1_plus_i_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $56,%rsp + +.LSEH_body_mul_by_1_plus_i_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 + + movq 56+8(%rsp),%r14 + + movq 56+16(%rsp),%r13 + + 
movq 56+24(%rsp),%r12 + + movq 56+32(%rsp),%rbx + + movq 56+40(%rsp),%rbp + + leaq 56+48(%rsp),%rsp + +.LSEH_epilogue_mul_by_1_plus_i_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_1_plus_i_mod_384x: +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + + +.LSEH_body_sgn0_pty_mod_384: + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + +.LSEH_epilogue_sgn0_pty_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384: + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384x: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mod_384x: + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384x: +.globl vec_select_32 + +.def vec_select_32; .scl 2; .type 32; .endef +.p2align 5 +vec_select_32: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 16(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 16(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 16(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-16(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_48 + 
+.def vec_select_48; .scl 2; .type 32; .endef +.p2align 5 +vec_select_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 24(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 24(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 24(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-24(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_96 + +.def vec_select_96; .scl 2; .type 32; .endef +.p2align 5 +vec_select_96: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 48(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 48(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 48(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_192 + +.def vec_select_192; .scl 2; .type 32; .endef +.p2align 5 +vec_select_192: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 96(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 96(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 96(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rcx) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 
144+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_144 + +.def vec_select_144; .scl 2; .type 32; .endef +.p2align 5 +vec_select_144: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 72(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 72(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 72(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_288 + +.def vec_select_288; .scl 2; .type 32; .endef +.p2align 5 +vec_select_288: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 144(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 144(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 144(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rcx) + pand 
%xmm4,%xmm2 + movdqu 144+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rcx) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rcx) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rcx) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rcx) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rcx) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rcx) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rcx) + .byte 0xf3,0xc3 + +.globl vec_prefetch + +.def vec_prefetch; .scl 2; .type 32; .endef +.p2align 5 +vec_prefetch: + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rcx,%rdx,1),%rdx + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + prefetchnta (%rcx) + .byte 0xf3,0xc3 + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_zero_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rcx),%xmm0 + leaq 16(%rcx),%rcx + +.Loop_is_zero: + decl %edx + jz .Loop_is_zero_done + movdqu (%rcx),%xmm1 + leaq 16(%rcx),%rcx + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_equal_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%r8d + movdqu (%rcx),%xmm0 + movdqu (%rdx),%xmm1 + subq %rcx,%rdx + leaq 16(%rcx),%rcx + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %r8d + jz .Loop_is_equal_done + movdqu (%rcx),%xmm1 + movdqu (%rcx,%rdx,1),%xmm2 + leaq 16(%rcx),%rcx + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %r8d + testq %rax,%rax + cmovnzl %r8d,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384 +.rva .LSEH_body_add_mod_384 +.rva .LSEH_info_add_mod_384_prologue + +.rva .LSEH_body_add_mod_384 +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_info_add_mod_384_body + +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_end_add_mod_384 +.rva .LSEH_info_add_mod_384_epilogue + +.rva 
.LSEH_begin_add_mod_384x +.rva .LSEH_body_add_mod_384x +.rva .LSEH_info_add_mod_384x_prologue + +.rva .LSEH_body_add_mod_384x +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_info_add_mod_384x_body + +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_end_add_mod_384x +.rva .LSEH_info_add_mod_384x_epilogue + +.rva .LSEH_begin_rshift_mod_384 +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_prologue + +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_body + +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_end_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_epilogue + +.rva .LSEH_begin_div_by_2_mod_384 +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_prologue + +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_body + +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_end_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_epilogue + +.rva .LSEH_begin_lshift_mod_384 +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_prologue + +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_body + +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_end_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384 +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_prologue + +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_body + +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_end_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384 +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_prologue + +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_body + +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_end_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384x +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_prologue + +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_body + +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_end_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384x +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_prologue + +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_body + +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_end_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_epilogue + +.rva .LSEH_begin_cneg_mod_384 +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_prologue + +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_body + +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_end_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384 +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_info_sub_mod_384_prologue + +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_info_sub_mod_384_body + +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_end_sub_mod_384 +.rva .LSEH_info_sub_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384x +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_prologue + +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_body + +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_end_sub_mod_384x 
+.rva .LSEH_info_sub_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_1_plus_i_mod_384x +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_prologue + +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_body + +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_end_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384 +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_prologue + +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_end_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384x +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_prologue + +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_end_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_rshift_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_rshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_div_by_2_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_div_by_2_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_div_by_2_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_lshift_mod_384_body: +.byte 1,0,17,0 +.byte 
0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_lshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_3_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_3_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_8_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_by_8_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_cneg_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_cneg_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 
0x00,0x00 +.LSEH_info_sub_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_1_plus_i_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_by_1_plus_i_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x07,0x00 +.byte 0x00,0xe4,0x08,0x00 +.byte 0x00,0xd4,0x09,0x00 +.byte 0x00,0xc4,0x0a,0x00 +.byte 0x00,0x34,0x0b,0x00 +.byte 0x00,0x54,0x0c,0x00 +.byte 0x00,0x74,0x0e,0x00 +.byte 0x00,0x64,0x0f,0x00 +.byte 0x00,0xc2 +.byte 0x00,0x00 +.LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mod_384x_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..79976cc0e7a --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s @@ -0,0 +1,326 @@ +.text + +.def __add_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq 
%rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x384 + +.def add_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384x384: + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x384: + +.globl sub_mod_384x384 + +.def sub_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384x384: + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x384: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384x384 +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_prologue + +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_body + +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_end_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_epilogue + +.rva .LSEH_begin_sub_mod_384x384 +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_prologue + +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_body + +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_end_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384x384_prologue: 
+.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_add_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_add_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sub_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sub_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..17c3d25278f --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S @@ -0,0 +1,798 @@ +.text + +.globl ct_inverse_mod_256 +.def ct_inverse_mod_256; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +.def __smul_256x63; +.type 32; +.endef +.p2align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + +.def __smul_512x63_tail; +.type 32; +.endef +.p2align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume 
|v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + +.def __smul_256_n_shift_by_31; +.type 32; +.endef +.p2align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + +.def __ab_approximation_31_256; +.type 32; +.endef +.p2align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + +.def __inner_loop_31_256; +.type 32; +.endef +.p2align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + +.def __inner_loop_62_256; +.type 32; +.endef +.p2align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..e7d4a6313b1 --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1209 @@ +.text + +.globl ct_inverse_mod_256 +.def ct_inverse_mod_256; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1072,%rsp + +.LSEH_body_ct_inverse_mod_256: + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + 
call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 
0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_256: +.def __smulq_512x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_512x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 + + +.def __smulq_256x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq 
%rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_256_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31_256; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp 
__inner_loop_31_256 + + .byte 0xf3,0xc3 + +.def __inner_loop_31_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __inner_loop_62_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_62_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_256 +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_prologue + +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_body + +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_end_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_inverse_mod_256_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x86,0x00 +.byte 0x00,0xe4,0x87,0x00 +.byte 0x00,0xd4,0x88,0x00 +.byte 0x00,0xc4,0x89,0x00 +.byte 0x00,0x34,0x8a,0x00 +.byte 0x00,0x54,0x8b,0x00 +.byte 0x00,0x74,0x8d,0x00 +.byte 0x00,0x64,0x8e,0x00 +.byte 0x00,0x01,0x8c,0x00 +.LSEH_info_ct_inverse_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..65193f1e96a --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S @@ -0,0 +1,729 @@ +.text + +.globl ct_inverse_mod_383 +.def ct_inverse_mod_383; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
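+// A brief summary of the helper below (inferred from the code itself, since
+// the detailed commentary lives in ctx_inverse_mod_384-x86_64): __smul_383x63
+// computes |u|*|f_| + |v|*|g_|, where |u| and |v| are the 6-limb (383-bit)
+// values at offsets 8*12 and 8*18 of the work area, and |f_|/|g_| are the
+// signed 63-bit factors passed in x20/x21 (conditionally negated together
+// with their operands). The low 6 limbs of the sum are stored at |x0|, and
+// the top limbs are carried over into __smul_767x63_tail.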
+.def __smul_383x63; +.type 32; +.endef +.p2align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + +.def __smul_767x63_tail; +.type 32; +.endef +.p2align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + +.def __smul_383_n_shift_by_62; +.type 32; +.endef +.p2align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, 
x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + +.def __ab_approximation_62; +.type 32; +.endef +.p2align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + +.def __inner_loop_62; +.type 32; +.endef +.p2align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret + diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..34336ff486b --- /dev/null +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S @@ -0,0 +1,334 @@ +.text + +.globl ct_is_square_mod_384 +.def ct_is_square_mod_384; +.type 32; +.endef +.p2align 5 +ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.p2align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +.def __smul_384_n_shift_by_30; +.type 32; +.endef +.p2align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, 
x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + +.def __ab_approximation_30; +.type 32; +.endef +.p2align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + +.def __inner_loop_30; +.type 32; +.endef +.p2align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + +.def __inner_loop_48; +.type 32; +.endef +.p2align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret + diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..ee4790321e6 --- 
/dev/null +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,505 @@ +.text + +.globl ct_is_square_mod_384 +.def ct_is_square_mod_384; .scl 2; .type 32; .endef +.p2align 5 +ct_is_square_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_is_square_mod_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $536,%rsp + +.LSEH_body_ct_is_square_mod_384: + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.p2align 5 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_is_square_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_is_square_mod_384: + +.def __smulq_384_n_shift_by_30; .scl 3; .type 32; .endef +.p2align 5 +__smulq_384_n_shift_by_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq 
$0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __ab_approximation_30; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 + +.def __inner_loop_30; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 + + +.def __inner_loop_48; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 
+ cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_is_square_mod_384 +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_prologue + +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_body + +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_end_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_is_square_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_is_square_mod_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x43,0x00 +.byte 0x00,0xe4,0x44,0x00 +.byte 0x00,0xd4,0x45,0x00 +.byte 0x00,0xc4,0x46,0x00 +.byte 0x00,0x34,0x47,0x00 +.byte 0x00,0x54,0x48,0x00 +.byte 0x00,0x74,0x4a,0x00 +.byte 0x00,0x64,0x4b,0x00 +.byte 0x00,0x01,0x49,0x00 +.LSEH_info_ct_is_square_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..42f058a3c8d --- /dev/null +++ b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1221 @@ +.text + +.globl ct_inverse_mod_383 +.def ct_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ct_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq 
%r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 
96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_383: +.def __smulq_767x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq 
$63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulq_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + 
xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_383_n_shift_by_62; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383_n_shift_by_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq 
%r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_62; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 + +.def __inner_loop_62; .scl 3; .type 32; .endef +.p2align 3 +.long 0 +__inner_loop_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_383 +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_prologue + +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_body + +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_end_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ct_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 
0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.LSEH_info_ct_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..7c13e56eb2a --- /dev/null +++ b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1596 @@ +.text + +.globl ctx_inverse_mod_383 +.def ctx_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ctx_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ctx_inverse_mod_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ctx_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + 
movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 
48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) 
+ movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ctx_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ctx_inverse_mod_383: +.def __smulx_767x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq 
%rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulx_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulx_383_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq 
%rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __smulx_191_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_191_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + 
+ movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 + +.def __inner_loop_31; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __inner_loop_62; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ctx_inverse_mod_383 +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_prologue + +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_body + +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_end_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ctx_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_ctx_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.LSEH_info_ctx_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/div3w-armv8.S b/crypto/blst_src/build/coff/div3w-armv8.S new file mode 100644 index 00000000000..c17b9e38336 --- /dev/null +++ b/crypto/blst_src/build/coff/div3w-armv8.S @@ -0,0 +1,94 @@ +.text + +.globl div_3_limbs +.def div_3_limbs; +.type 32; +.endef +.p2align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc 
x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + +.globl quot_rem_128 +.def quot_rem_128; +.type 32; +.endef +.p2align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + + +.globl quot_rem_64 +.def quot_rem_64; +.type 32; +.endef +.p2align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + diff --git a/crypto/blst_src/build/coff/div3w-x86_64.s b/crypto/blst_src/build/coff/div3w-x86_64.s new file mode 100644 index 00000000000..fcfe54480be --- /dev/null +++ b/crypto/blst_src/build/coff/div3w-x86_64.s @@ -0,0 +1,140 @@ +.text + +.globl div_3_limbs + +.def div_3_limbs; .scl 2; .type 32; .endef +.p2align 5 +div_3_limbs: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_div_3_limbs: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_div_3_limbs: +.globl quot_rem_128 + +.def quot_rem_128; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_128: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_quot_rem_128: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_quot_rem_128: + + + + + +.globl quot_rem_64 + +.def quot_rem_64; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_64: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%rax +.LSEH_begin_quot_rem_64: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq 
%rax,8(%rdi) + + movq 8(%rsp),%rdi + movq 16(%rsp),%rsi + .byte 0xf3,0xc3 +.LSEH_end_quot_rem_64: +.section .pdata +.p2align 2 +.section .xdata +.p2align 3 diff --git a/crypto/blst_src/build/coff/mul_mont_256-armv8.S b/crypto/blst_src/build/coff/mul_mont_256-armv8.S new file mode 100644 index 00000000000..8cadbb89344 --- /dev/null +++ b/crypto/blst_src/build/coff/mul_mont_256-armv8.S @@ -0,0 +1,474 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; +.type 32; +.endef +.p2align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel 
x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; +.type 32; +.endef +.p2align 5 +sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl from_mont_256 + +.def from_mont_256; +.type 32; +.endef +.p2align 5 +from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl redc_mont_256 + +.def redc_mont_256; +.type 32; +.endef +.p2align 5 +redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
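
The block-diagram comment in sqr_mont_sparse_256 above describes the standard squaring shortcut: accumulate each cross product a[i]*a[j] (i < j) once, double the whole accumulator, then add the squares a[i]*a[i] on the even columns. A small self-contained check of that identity, using 16-bit "limbs" so plain uint64/__int128 arithmetic can hold every column (values and layout are illustrative only):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* v = a0 + a1*B + a2*B^2 + a3*B^3 with B = 2^16 */
        const uint64_t B = 1u << 16;
        const uint16_t a[4] = { 0x1234, 0xfedc, 0x0042, 0x9abc };
        uint64_t v = a[0] + a[1]*B + a[2]*B*B + a[3]*B*B*B;

        uint64_t acc[7] = {0};

        for (int i = 0; i < 4; i++)         /* cross products, each taken once */
            for (int j = i + 1; j < 4; j++)
                acc[i + j] += (uint64_t)a[i] * a[j];

        for (int k = 0; k < 7; k++)         /* double the accumulator ...      */
            acc[k] *= 2;

        for (int i = 0; i < 4; i++)         /* ... and add the squares         */
            acc[2*i] += (uint64_t)a[i] * a[i];

        unsigned __int128 lhs = 0;          /* recombine columns at B = 2^16   */
        for (int k = 6; k >= 0; k--)
            lhs = (lhs << 16) + acc[k];

        assert(lhs == (unsigned __int128)v * v);
        return 0;
    }

In the assembly the doubling is the run of adds-with-carry marked "acc[1-6]*=2", and the "can't overflow" notes flag additions into the high halves of products, which the original comment explains can never be all ones.
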
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.def __mul_by_1_mont_256; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/crypto/blst_src/build/coff/mul_mont_384-armv8.S b/crypto/blst_src/build/coff/mul_mont_384-armv8.S new file mode 100644 index 00000000000..074f38c495c --- /dev/null +++ b/crypto/blst_src/build/coff/mul_mont_384-armv8.S @@ -0,0 +1,2424 @@ +.text + +.globl add_mod_384x384 +.def add_mod_384x384; +.type 32; +.endef +.p2align 5 +add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
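
__mul_by_1_mont_256 above (and the 384-bit variants later in this patch) removes one power of the word size per round of Montgomery reduction: each round computes m = n0*x[0] so that the low limb of x + m*p is exactly zero, which is why the commented-out low-limb addition can be replaced by the carry-only "subs xzr,x10,#1", and then shifts everything down one limb. A minimal single-word sketch of one such round, assuming an odd modulus and n0 = -p^-1 mod 2^64; the helper name and the modulus are illustrative, not the library's API:

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* n0 = -p^{-1} mod 2^64 via Newton iteration (p must be odd) */
    static uint64_t neg_inv64(uint64_t p)
    {
        uint64_t x = p;                  /* correct to 3 bits: p*p == 1 (mod 8) */
        for (int i = 0; i < 5; i++)
            x *= 2 - p * x;              /* each pass doubles the correct bits  */
        return (uint64_t)0 - x;
    }

    int main(void)
    {
        uint64_t p  = 0xffffffff00000001ULL;       /* any odd modulus works    */
        uint64_t n0 = neg_inv64(p);
        uint64_t x  = 0x0123456789abcdefULL % p;

        /* one reduction round: choose m so the low word of x + m*p cancels,
         * then drop that word, i.e. divide exactly by 2^64                  */
        uint64_t m = x * n0;
        u128 t = (u128)m * p + x;
        assert((uint64_t)t == 0);        /* low word is zero by construction    */
        uint64_t r = (uint64_t)(t >> 64);

        /* r is x * 2^-64 mod p: multiplying back by 2^64 returns x (mod p)  */
        assert((((u128)r << 64) + p - x) % p == 0);
        return 0;
    }

The 256-bit routine runs four such rounds back to back across the limb array, so it returns x * 2^-256 mod p without ever performing a trial division.
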
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __add_mod_384x384; +.type 32; +.endef +.p2align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl sub_mod_384x384 +.def sub_mod_384x384; +.type 32; +.endef +.p2align 5 +sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __sub_mod_384x384; +.type 32; +.endef +.p2align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp 
x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_mont_384x + +.def mul_mont_384x; +.type 32; +.endef +.p2align 5 +mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_384x + +.def sqr_mont_384x; +.type 32; +.endef +.p2align 5 +sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
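
mul_mont_384x above multiplies two Fp2 elements (re + im*u with u^2 = -1) using three 384-bit multiplications instead of four, exactly as its inline comments spell out: t0 = a->re*b->re, t1 = a->im*b->im, t2 = (a->re + a->im)*(b->re + b->im), then ret->re = redc(t0 - t1) and ret->im = redc(t2 - t0 - t1). A toy sketch of the same Karatsuba-style shape, with a small prime standing in for the 384-bit Montgomery field (the prime, names and values are illustrative only):

    #include <assert.h>
    #include <stdint.h>

    #define P 2147483647u   /* 2^31 - 1, a prime; stand-in for the 384-bit field */

    static uint32_t addp(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a + b) % P); }
    static uint32_t subp(uint32_t a, uint32_t b) { return (uint32_t)(((uint64_t)a + P - b) % P); }
    static uint32_t mulp(uint32_t a, uint32_t b) { return (uint32_t)((uint64_t)a * b % P); }

    int main(void)
    {
        uint32_t a0 = 123456789, a1 = 987654321, b0 = 55555, b1 = 777777777;

        uint32_t t0 = mulp(a0, b0);                      /* a->re * b->re        */
        uint32_t t1 = mulp(a1, b1);                      /* a->im * b->im        */
        uint32_t t2 = mulp(addp(a0, a1), addp(b0, b1));  /* (re+im) * (re+im)    */

        uint32_t re = subp(t0, t1);                      /* t0 - t1              */
        uint32_t im = subp(subp(t2, t0), t1);            /* t2 - t0 - t1         */

        /* must agree with the four-multiplication schoolbook formula */
        assert(re == subp(mulp(a0, b0), mulp(a1, b1)));
        assert(im == addp(mulp(a0, b1), mulp(a1, b0)));
        return 0;
    }

One full-width multiplication is traded for a few extra 384-bit additions and subtractions, a clear win at this operand size.
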
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl mul_mont_384 + +.def mul_mont_384; +.type 32; +.endef +.p2align 5 +mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_384; +.type 32; +.endef +.p2align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh 
x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs 
x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl sqr_mont_384 + +.def sqr_mont_384; +.type 32; +.endef +.p2align 5 +sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; +.type 32; +.endef +.p2align 5 +sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + +.def __sqr_384; +.type 32; +.endef +.p2align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl sqr_384 + +.def sqr_384; +.type 32; +.endef +.p2align 5 +sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl redc_mont_384 + +.def redc_mont_384; +.type 32; +.endef +.p2align 5 +redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl from_mont_384 + +.def from_mont_384; +.type 32; +.endef +.p2align 5 +from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_by_1_mont_384; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + 
adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + +.def __redc_tail_mont_384; +.type 32; +.endef +.p2align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_384 + +.def mul_384; +.type 32; +.endef +.p2align 5 +mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_384; +.type 32; +.endef +.p2align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl mul_382x + +.def mul_382x; +.type 32; +.endef +.p2align 5 +mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
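
__mul_384 above is a plain, non-reducing schoolbook multiplication: it walks the words of b, multiplies the whole of a by each word, and accumulates the rows into a 768-bit result. A hedged C sketch of that operand-scanning pattern (the function name and test values are illustrative):

    #include <assert.h>
    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* 6x6-limb schoolbook multiply, one word of b per outer pass, matching the
     * unrolled assembly; r receives the full 12-limb product                  */
    static void mul_384_sketch(uint64_t r[12],
                               const uint64_t a[6], const uint64_t b[6])
    {
        for (int i = 0; i < 12; i++)
            r[i] = 0;
        for (int j = 0; j < 6; j++) {
            u128 carry = 0;
            for (int i = 0; i < 6; i++) {
                carry += (u128)a[i] * b[j] + r[i + j];
                r[i + j] = (uint64_t)carry;
                carry >>= 64;
            }
            r[j + 6] = (uint64_t)carry;
        }
    }

    int main(void)
    {
        uint64_t a[6] = { 0xffffffffffffffffULL, 0, 0, 0, 0, 0 };
        uint64_t b[6] = { 0xfedcba9876543210ULL, 0, 0, 0, 0, 0 };
        uint64_t r[12];

        mul_384_sketch(r, a, b);

        /* spot-check the low limbs against native 128-bit arithmetic */
        u128 p = (u128)a[0] * b[0];
        assert(r[0] == (uint64_t)p && r[1] == (uint64_t)(p >> 64));
        return 0;
    }

No modular reduction happens at this stage; callers reduce the 768-bit product separately.
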
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_382x + +.def sqr_382x; +.type 32; +.endef +.p2align 5 +sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_382x + +.def sqr_mont_382x; +.type 32; +.endef +.p2align 5 +sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
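
sqr_382x above uses the complementary identity for squaring: with t0 = a->re + a->im and t1 = a->re - a->im (plus the conditional modulus add that keeps t1 non-negative), the real part is t0*t1 and the imaginary part is 2*(a->re*a->im), the doubling being the final "add with itself" pass. A tiny integer check of the identity (values illustrative only):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* a = a0 + a1*u with u^2 = -1: squaring needs only two multiplications */
        int64_t a0 = 123456789, a1 = -987654;

        int64_t re = (a0 + a1) * (a0 - a1);   /* t0 * t1                        */
        int64_t im = 2 * a0 * a1;             /* doubled product                */

        assert(re == a0*a0 - a1*a1);          /* real part of (a0 + a1*u)^2     */
        assert(im == a0*a1 + a1*a0);          /* imaginary part                 */
        return 0;
    }
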
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_383_nonred; +.type 32; +.endef +.p2align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc 
x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs 
x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..dd1e00fa301 --- /dev/null +++ b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s @@ -0,0 +1,872 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_mul_mont_sparse_256: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_sparse_256: + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqr_mont_sparse_256: + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_sparse_256: +.def __mulq_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + 
+ mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 + + +.globl from_mont_256 + +.def from_mont_256; .scl 2; .type 32; .endef +.p2align 5 +from_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + 
cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_256: + +.globl redc_mont_256 + +.def redc_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_256: +.def __mulq_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + 
movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_sparse_256 +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_prologue + +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_body + +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_end_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqr_mont_sparse_256 +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_prologue + +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_body + +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_end_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_epilogue + +.rva .LSEH_begin_from_mont_256 +.rva .LSEH_body_from_mont_256 +.rva .LSEH_info_from_mont_256_prologue + +.rva .LSEH_body_from_mont_256 +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_info_from_mont_256_body + +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_end_from_mont_256 +.rva .LSEH_info_from_mont_256_epilogue + +.rva .LSEH_begin_redc_mont_256 +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_info_redc_mont_256_prologue + +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_info_redc_mont_256_body + +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_end_redc_mont_256 +.rva .LSEH_info_redc_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mul_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 
0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..5663463524a --- /dev/null +++ b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s @@ -0,0 +1,4206 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mul_mont_384x + +.def mul_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384x: + 
.byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mul_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384x: +.globl sqr_mont_384x + +.def sqr_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + 
+.LSEH_end_sqr_mont_384x: + +.globl mul_382x + +.def mul_382x; .scl 2; .type 32; .endef +.p2align 5 +mul_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mul_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_382x: +.globl sqr_382x + +.def sqr_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqr_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + 
movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_382x: +.globl mul_384 + +.def mul_384; .scl 2; .type 32; .endef +.p2align 5 +mul_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_384: + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_384: + +.def __mulq_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + 
mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_384 + +.def sqr_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqr_384: + + + call __sqrq_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_384: + +.def __sqrq_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq 
%r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl sqr_mont_384 + +.def sqr_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $120,%rsp + +.LSEH_body_sqr_mont_384: + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384: + + + +.globl redc_mont_384 + +.def redc_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_384: + + + + +.globl from_mont_384 + +.def from_mont_384; .scl 2; .type 32; .endef +.p2align 5 +from_mont_384: + .byte 
0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_384: +.def __mulq_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + 
adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + + +.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + 
movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384: + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384x: +.globl mul_mont_384 + +.def mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_mul_mont_384: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384: +.def __mulq_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq 
%rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + 
+ mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq 
$0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_n_mul_mont_384 + +.def sqr_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_384: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_384: + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_383: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 
8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_383: +.def __mulq_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + 
movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 
16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 + +.globl sqr_mont_382x + +.def sqr_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + 
andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_384x +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_prologue + +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_body + +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_end_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_epilogue + +.rva .LSEH_begin_sqr_mont_384x +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_prologue + +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_body + +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_end_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_epilogue + +.rva .LSEH_begin_mul_382x +.rva .LSEH_body_mul_382x +.rva .LSEH_info_mul_382x_prologue + +.rva .LSEH_body_mul_382x +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_info_mul_382x_body + +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_end_mul_382x +.rva .LSEH_info_mul_382x_epilogue + +.rva .LSEH_begin_sqr_382x +.rva .LSEH_body_sqr_382x +.rva .LSEH_info_sqr_382x_prologue + +.rva .LSEH_body_sqr_382x +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_info_sqr_382x_body + +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_end_sqr_382x +.rva .LSEH_info_sqr_382x_epilogue + +.rva .LSEH_begin_mul_384 +.rva .LSEH_body_mul_384 +.rva .LSEH_info_mul_384_prologue + +.rva .LSEH_body_mul_384 +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_info_mul_384_body + +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_end_mul_384 +.rva .LSEH_info_mul_384_epilogue + +.rva .LSEH_begin_sqr_384 +.rva .LSEH_body_sqr_384 +.rva .LSEH_info_sqr_384_prologue + +.rva .LSEH_body_sqr_384 +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_info_sqr_384_body + +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_end_sqr_384 +.rva .LSEH_info_sqr_384_epilogue + +.rva .LSEH_begin_sqr_mont_384 +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_prologue + +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_body + +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_end_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_epilogue + +.rva .LSEH_begin_redc_mont_384 +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_info_redc_mont_384_prologue + +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_info_redc_mont_384_body + +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_end_redc_mont_384 +.rva .LSEH_info_redc_mont_384_epilogue + +.rva .LSEH_begin_from_mont_384 +.rva .LSEH_body_from_mont_384 +.rva .LSEH_info_from_mont_384_prologue + +.rva .LSEH_body_from_mont_384 +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_info_from_mont_384_body + +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_end_from_mont_384 +.rva .LSEH_info_from_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384 +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_prologue + +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva 
.LSEH_end_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384x +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_end_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_epilogue + +.rva .LSEH_begin_mul_mont_384 +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_info_mul_mont_384_prologue + +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_info_mul_mont_384_body + +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_end_mul_mont_384 +.rva .LSEH_info_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_384 +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_prologue + +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_end_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_383 +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_prologue + +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_end_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqr_mont_382x +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_prologue + +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_body + +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_end_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mul_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mul_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_382x_body: 
+.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_384_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqr_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0f,0x00 +.byte 0x00,0xe4,0x10,0x00 +.byte 0x00,0xd4,0x11,0x00 +.byte 0x00,0xc4,0x12,0x00 +.byte 0x00,0x34,0x13,0x00 +.byte 0x00,0x54,0x14,0x00 +.byte 0x00,0x74,0x16,0x00 +.byte 0x00,0x64,0x17,0x00 +.byte 0x00,0x01,0x15,0x00 +.LSEH_info_sqr_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redc_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redc_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_from_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_from_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 
+.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_n_mul_mont_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqr_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqr_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..75c7e82bc1a --- /dev/null +++ b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s @@ -0,0 +1,784 @@ +.text + +.globl mulx_mont_sparse_256 + +.def mulx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mulx_mont_sparse_256: + + + movq %rdx,%rbx + movq 
0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_sparse_256: + +.globl sqrx_mont_sparse_256 + +.def sqrx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_sparse_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqrx_mont_sparse_256: + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_sparse_256: +.def __mulx_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq 
%r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 + +.globl fromx_mont_256 + +.def fromx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_256: + +.globl redcx_mont_256 + +.def redcx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_256: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_256: +.def __mulx_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 
24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_sparse_256 +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_prologue + +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_body + +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_end_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqrx_mont_sparse_256 +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_prologue + +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_body + +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_end_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_epilogue + +.rva .LSEH_begin_fromx_mont_256 +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_prologue + +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_body + +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_end_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_epilogue + +.rva .LSEH_begin_redcx_mont_256 +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_prologue + +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_body + +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_end_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 
+.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..12306a7ff5c --- /dev/null +++ b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s @@ -0,0 +1,3559 @@ +.text + + + + + + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __add_mod_384; .scl 3; .type 32; .endef 
+.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mulx_mont_384x + +.def mulx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mulx_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384x: +.globl sqrx_mont_384x + +.def sqrx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 
+sqrx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384x: + +.globl mulx_382x + +.def mulx_382x; .scl 2; .type 32; .endef +.p2align 5 +mulx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mulx_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 
48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_382x: +.globl sqrx_382x + +.def sqrx_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqrx_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_382x: +.globl mulx_384 + +.def mulx_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +.LSEH_body_mulx_384: + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +.LSEH_epilogue_mulx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_384: + +.def __mulx_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 
16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqrx_384 + +.def sqrx_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_384: + movq %rcx,%rdi + movq %rdx,%rsi + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqrx_384: + + + call __sqrx_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_384: +.def __sqrx_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 
0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 + + + + +.globl redcx_mont_384 + +.def redcx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_384: + + + + +.globl fromx_mont_384 + +.def fromx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 
40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_384: +.def __mulx_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 + + +.def 
__redc_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redc_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0x_pty_mont_384 + +.def sgn0x_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384: + +.globl sgn0x_pty_mont_384x + +.def sgn0x_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 
24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384x: +.globl mulx_mont_384 + +.def mulx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_mulx_mont_384: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384: +.def __mulx_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq 
%rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq 
%rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 + + +.globl sqrx_mont_384 + +.def sqrx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_sqrx_mont_384: + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384: + +.globl sqrx_n_mul_mont_384 + +.def sqrx_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_384: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_384: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_384: + +.globl sqrx_n_mul_mont_383 + +.def sqrx_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_383: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 
40(%rsp),%r8 + movq 48(%rsp),%r9 + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_383: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_383: +.def __mulx_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 
8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 
0xf3,0xc3 + + +.globl sqrx_mont_382x + +.def sqrx_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_382x: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_384x +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_prologue + +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_body + +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_end_mulx_mont_384x +.rva 
.LSEH_info_mulx_mont_384x_epilogue + +.rva .LSEH_begin_sqrx_mont_384x +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_prologue + +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_body + +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_end_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_epilogue + +.rva .LSEH_begin_mulx_382x +.rva .LSEH_body_mulx_382x +.rva .LSEH_info_mulx_382x_prologue + +.rva .LSEH_body_mulx_382x +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_info_mulx_382x_body + +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_end_mulx_382x +.rva .LSEH_info_mulx_382x_epilogue + +.rva .LSEH_begin_sqrx_382x +.rva .LSEH_body_sqrx_382x +.rva .LSEH_info_sqrx_382x_prologue + +.rva .LSEH_body_sqrx_382x +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_info_sqrx_382x_body + +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_end_sqrx_382x +.rva .LSEH_info_sqrx_382x_epilogue + +.rva .LSEH_begin_mulx_384 +.rva .LSEH_body_mulx_384 +.rva .LSEH_info_mulx_384_prologue + +.rva .LSEH_body_mulx_384 +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_info_mulx_384_body + +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_end_mulx_384 +.rva .LSEH_info_mulx_384_epilogue + +.rva .LSEH_begin_sqrx_384 +.rva .LSEH_body_sqrx_384 +.rva .LSEH_info_sqrx_384_prologue + +.rva .LSEH_body_sqrx_384 +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_info_sqrx_384_body + +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_end_sqrx_384 +.rva .LSEH_info_sqrx_384_epilogue + +.rva .LSEH_begin_redcx_mont_384 +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_prologue + +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_body + +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_end_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_epilogue + +.rva .LSEH_begin_fromx_mont_384 +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_prologue + +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_body + +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_end_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384 +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_end_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384x +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_end_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_epilogue + +.rva .LSEH_begin_mulx_mont_384 +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_prologue + +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_body + +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_end_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_mont_384 +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_prologue + +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_body + +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_end_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_384 +.rva .LSEH_body_sqrx_n_mul_mont_384 
+.rva .LSEH_info_sqrx_n_mul_mont_384_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_end_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_383 +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_end_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqrx_mont_382x +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_prologue + +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_body + +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_end_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.LSEH_info_mulx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_mulx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x00,0x00 +.byte 0x00,0xe4,0x01,0x00 +.byte 0x00,0xd4,0x02,0x00 +.byte 0x00,0xc4,0x03,0x00 +.byte 0x00,0x34,0x04,0x00 +.byte 0x00,0x54,0x05,0x00 +.byte 0x00,0x74,0x07,0x00 +.byte 0x00,0x64,0x08,0x00 +.byte 0x00,0x52 
+.byte 0x00,0x00 +.LSEH_info_mulx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sqrx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_redcx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_redcx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_fromx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_fromx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sgn0x_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_mulx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_mulx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 
0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00 +.LSEH_info_sqrx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_n_mul_mont_383_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_sqrx_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.LSEH_info_sqrx_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/sha256-armv8.S b/crypto/blst_src/build/coff/sha256-armv8.S new file mode 100644 index 00000000000..a8bcbd3631b --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-armv8.S @@ -0,0 +1,1087 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with the rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details.
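+//
+// Calling convention (an assumption inferred from the register usage in the
+// routines below, not something stated in this file): both entry points take
+// the 8-word hash state in x0, the input pointer in x1 and the number of
+// 64-byte blocks in x2, i.e. they are expected to be driven from C through a
+// prototype along the lines of
+//
+//   void blst_sha256_block_data_order(unsigned int h[8],
+//                                     const void *inp, size_t blocks);
+//
+// so a hypothetical caller compresses whole blocks only, e.g.
+//
+//   blst_sha256_block_data_order(h, buf, len / 64);
+//
+// and performs the final-block padding itself before the last call.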
+ +.text + +.p2align 6 + +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.p2align 2 +.globl blst_sha256_block_armv8 +.def blst_sha256_block_armv8; +.type 32; +.endef +.p2align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b 
+ orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; +.type 32; +.endef +.p2align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.p2align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl blst_sha256_emit + +.def blst_sha256_emit; +.type 32; +.endef +.p2align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/crypto/blst_src/build/coff/sha256-portable-x86_64.s b/crypto/blst_src/build/coff/sha256-portable-x86_64.s new file mode 100644 index 00000000000..e499d107c70 --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-portable-x86_64.s @@ -0,0 +1,1784 @@ +.text + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order: + movq %rcx,%rdi + 
movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+24,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) +.LSEH_body_blst_sha256_block_data_order: + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.p2align 4 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl 
%r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl 
%ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.p2align 4 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + 
movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl 
%edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl 
$10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 
52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 + + movq 64+24(%rsp),%r15 + + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0b,0x00 +.byte 0x00,0xe4,0x0c,0x00 +.byte 0x00,0xd4,0x0d,0x00 +.byte 0x00,0xc4,0x0e,0x00 +.byte 0x00,0x54,0x0f,0x00 +.byte 0x00,0x34,0x10,0x00 +.byte 0x00,0x74,0x12,0x00 +.byte 0x00,0x64,0x13,0x00 +.byte 0x00,0x01,0x11,0x00 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + diff --git a/crypto/blst_src/build/coff/sha256-x86_64.s b/crypto/blst_src/build/coff/sha256-x86_64.s new file mode 100644 index 00000000000..ed28b781d4c --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-x86_64.s @@ -0,0 +1,1560 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext + +.def blst_sha256_block_data_order_shaext; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order_shaext: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order_shaext: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + subq $0x58,%rsp + + movaps %xmm6,-88(%r11) + + movaps %xmm7,-72(%r11) + + movaps %xmm8,-56(%r11) + + movaps %xmm9,-40(%r11) + + movaps %xmm10,-24(%r11) + +.LSEH_body_blst_sha256_block_data_order_shaext: + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd 
%xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + movaps -88(%r11),%xmm6 + movaps -72(%r11),%xmm7 + movaps -56(%r11),%xmm8 + movaps -40(%r11),%xmm9 + movaps -24(%r11),%xmm10 + movq %r11,%rsp + +.LSEH_epilogue_blst_sha256_block_data_order_shaext: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order_shaext: +.globl blst_sha256_block_data_order + +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order: + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $104,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movaps %xmm6,32(%rsp) + + movaps %xmm7,48(%rsp) + + movaps %xmm8,64(%rsp) + + movaps %xmm9,80(%rsp) + + movq %rsp,%rbp + +.LSEH_body_blst_sha256_block_data_order: + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.p2align 4 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + 
paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.p2align 4 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd 
$250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl 
%r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl 
%edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl 
%r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl 
%r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 104+48(%rbp),%r11 + + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movaps 32(%rbp),%xmm6 + movaps 48(%rbp),%xmm7 + movaps 64(%rbp),%xmm8 + movaps 80(%rbp),%xmm9 + movq 104(%rbp),%r15 + + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbx + + movq -8(%r11),%rbp + +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order_shaext +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_prologue + +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva .LSEH_end_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_epilogue + +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva 
.LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_shaext_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_shaext_body: +.byte 1,0,15,0 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xa8,0x04,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_blst_sha256_block_data_order_shaext_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0x03 +.byte 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,26,5 +.byte 0x00,0x68,0x02,0x00 +.byte 0x00,0x78,0x03,0x00 +.byte 0x00,0x88,0x04,0x00 +.byte 0x00,0x98,0x05,0x00 +.byte 0x00,0xf4,0x0d,0x00 +.byte 0x00,0xe4,0x0e,0x00 +.byte 0x00,0xd4,0x0f,0x00 +.byte 0x00,0xc4,0x10,0x00 +.byte 0x00,0x34,0x11,0x00 +.byte 0x00,0x74,0x14,0x00 +.byte 0x00,0x64,0x15,0x00 +.byte 0x00,0x03 +.byte 0x00,0x01,0x12,0x00 +.byte 0x00,0x50 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x03 +.byte 0x00,0x00 + diff --git a/crypto/blst_src/build/elf/add_mod_256-armv8.S b/crypto/blst_src/build/elf/add_mod_256-armv8.S new file mode 100644 index 00000000000..57476aaa1da --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs 
x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp 
x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 diff --git a/crypto/blst_src/build/elf/add_mod_256-x86_64.s b/crypto/blst_src/build/elf/add_mod_256-x86_64.s new file mode 100644 index 00000000000..2f41781959c --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_256-x86_64.s @@ -0,0 +1,572 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,@function +.align 32 +add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_256,.-add_mod_256 + + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,@function +.align 32 +mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,@function +.align 32 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_256,.-__lshift_mod_256 + + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,@function +.align 32 +lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 
+ pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,@function +.align 32 +rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,@function +.align 32 +cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,@function +.align 32 +sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq 
%rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,@function +.align 32 +check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size check_mod_256,.-check_mod_256 + + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,@function +.align 32 +add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,@function +.align 32 +sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/add_mod_384-armv8.S b/crypto/blst_src/build/elf/add_mod_384-armv8.S new file mode 100644 index 00000000000..5c18d7fe892 --- /dev/null +++ 
b/crypto/blst_src/build/elf/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + 
sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,%function +.align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,%function +.align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,%function +.align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,%function +.align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,%function +.align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, 
[x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,%function +.align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,%function +.align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,%function +.align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, 
[x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_equal_16x,.-vec_is_equal_16x diff --git a/crypto/blst_src/build/elf/add_mod_384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384-x86_64.s new file mode 100644 index 00000000000..39eee6d1752 --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_384-x86_64.s @@ -0,0 +1,1907 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,@function +.align 32 +add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,@function +.align 32 +add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
add_mod_384x,.-add_mod_384x + + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,@function +.align 32 +rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,@function +.align 32 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,@function +.align 32 +div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,@function +.align 32 
+lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,@function +.align 32 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_384,.-__lshift_mod_384 + + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,@function +.align 32 +mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,@function +.align 32 
+mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,@function +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,@function +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + 
movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,@function +.align 32 +cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,@function +.align 32 +sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 
0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,@function +.align 32 +sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,@function +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + 
adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,@function +.align 32 +sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,@function +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp 
+.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,@function +.align 32 +vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,@function +.align 32 +vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,@function +.align 32 +vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,@function +.align 32 +vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu 
%xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,@function +.align 32 +vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,@function +.align 32 +vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 
+ pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,@function +.align 32 +vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,@function +.align 32 +vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + movdqu (%rdi),%xmm0 + leaq 
16(%rdi),%rdi + +.Loop_is_zero: + decl %esi + jz .Loop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,@function +.align 32 +vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %edx + jz .Loop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_equal_16x,.-vec_is_equal_16x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..084f3d8262d --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s @@ -0,0 +1,252 @@ +.text + +.type __add_mod_384x384,@function +.align 32 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq 
%r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,@function +.align 32 +add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,@function +.align 32 +sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..347eb315f40 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + .inst 0xd503233f + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + .inst 0xd50323bf + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, 
[x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..c4d8d6d3700 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1185 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256,@function +.align 32 +ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + 
xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + 
xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq 
%r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +.type __smulq_512x63,@function +.align 32 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,@function +.align 32 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq 
$0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256x63,.-__smulq_256x63 +.type __smulq_256_n_shift_by_31,@function +.align 32 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +.type __ab_approximation_31_256,@function +.align 32 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq 
%r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +.type __inner_loop_31_256,@function +.align 32 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,@function +.align 32 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62_256,.-__inner_loop_62_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..d7eca17073c --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.type __smul_383x63, %function +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, %function +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + 
eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..3f1390ed9dc --- /dev/null +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, 
#30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret +.size __inner_loop_30,.-__inner_loop_30 +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret +.size 
__inner_loop_48,.-__inner_loop_48 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..fec1493cb12 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,479 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,@function +.align 32 +ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,@function +.align 32 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + 
movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +.type __ab_approximation_30,@function +.align 32 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_30,.-__ab_approximation_30 +.type __inner_loop_30,@function +.align 32 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq 
%r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,@function +.align 32 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_48,.-__inner_loop_48 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..b702262f6e5 --- /dev/null +++ b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1195 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,@function +.align 32 +ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + 
addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call 
__smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +.type __smulq_767x63,@function +.align 32 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 
40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_767x63,.-__smulq_767x63 +.type __smulq_383x63,@function +.align 32 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq 
%rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383x63,.-__smulq_383x63 +.type __smulq_383_n_shift_by_62,@function +.align 32 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq 
%rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +.type __ab_approximation_62,@function +.align 32 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62,@function +.align 8 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..25a5fa5345f --- /dev/null +++ 
b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1574 @@ +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,@function +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx 
+ movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + 
call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq 
%rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 
96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +.type __smulx_767x63,@function +.align 32 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 
8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_767x63,.-__smulx_767x63 +.type __smulx_383x63,@function +.align 32 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383x63,.-__smulx_383x63 +.type __smulx_383_n_shift_by_31,@function +.align 32 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 
48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +.type __smulx_191_n_shift_by_31,@function +.align 32 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +.type __ab_approximation_31,@function +.align 32 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 
48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31,.-__ab_approximation_31 +.type __inner_loop_31,@function +.align 32 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,@function +.align 32 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/div3w-armv8.S b/crypto/blst_src/build/elf/div3w-armv8.S new file mode 100644 index 00000000000..a2b1d676a36 --- /dev/null +++ b/crypto/blst_src/build/elf/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // 
dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 diff --git a/crypto/blst_src/build/elf/div3w-x86_64.s b/crypto/blst_src/build/elf/div3w-x86_64.s new file mode 100644 index 00000000000..00ae5699824 --- /dev/null +++ b/crypto/blst_src/build/elf/div3w-x86_64.s @@ -0,0 +1,123 @@ +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,@function +.align 32 +div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,@function +.align 32 +quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_128,.-quot_rem_128 + + + + + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,@function +.align 32 +quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_64,.-quot_rem_64 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mul_mont_256-armv8.S b/crypto/blst_src/build/elf/mul_mont_256-armv8.S new file mode 100644 index 00000000000..8bb1197f464 --- /dev/null +++ b/crypto/blst_src/build/elf/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 diff --git a/crypto/blst_src/build/elf/mul_mont_384-armv8.S b/crypto/blst_src/build/elf/mul_mont_384-armv8.S new file mode 100644 index 00000000000..c048e816b85 --- /dev/null +++ b/crypto/blst_src/build/elf/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + 
ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh 
x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc 
x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __sqr_384,%function +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret +.size __sqr_384,.-__sqr_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc 
x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + .inst 0xd503233f + stp 
x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 
+ umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul 
x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x diff --git a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..37abd4392d3 --- /dev/null +++ b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s @@ -0,0 +1,714 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,@function +.align 32 +mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,@function +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.type __mulq_mont_sparse_256,@function +.align 32 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq 
%rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,@function +.align 32 
+from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,@function +.align 32 +redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +.type __mulq_by_1_mont_256,@function +.align 32 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq 
%r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..fa9dd3529ad --- /dev/null +++ b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s @@ -0,0 +1,3620 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq 
%r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,@function +.align 32 +mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,@function +.align 32 +sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,@function +.align 32 +mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 
-96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_382x,.-mul_382x +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,@function +.align 32 +sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_382x,.-sqr_382x +.globl mul_384 +.hidden mul_384 +.type mul_384,@function +.align 32 +mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,@function +.align 32 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq 
%rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq 
%rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_384,.-__mulq_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,@function +.align 32 +sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,@function +.align 32 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq 
%r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,@function +.align 32 +sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 + + + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,@function +.align 32 +redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + + + + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,@function +.align 32 +from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 
0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_384,.-from_mont_384 +.type __mulq_by_1_mont_384,@function +.align 32 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + 
adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,@function +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 
32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,@function +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,@function +.align 32 +mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +.type __mulq_mont_384,@function +.align 32 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + 
+ movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + 
movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq 
$0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_384,.-__mulq_mont_384 +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,@function +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,@function +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp 
+.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __mulq_mont_383_nonred,@function +.align 32 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + 
addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + 
adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,@function +.align 32 +sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + 
leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..20a02073246 --- /dev/null +++ b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s @@ -0,0 +1,627 @@ +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,@function +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,@function +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +.type __mulx_mont_sparse_256,@function +.align 32 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 
0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,@function +.align 32 +fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,@function +.align 32 +redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +.type __mulx_by_1_mont_256,@function +.align 32 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..9f9f7404ee4 --- /dev/null +++ b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s @@ -0,0 +1,2968 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq 
%r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,@function +.align 32 +mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi 
+ call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,@function +.align 32 +sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,@function +.align 32 +mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 
24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_382x,.-mulx_382x +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,@function +.align 32 +sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + 
movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,@function +.align 32 +mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,@function +.align 32 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 
128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_384,.-__mulx_384 +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,@function +.align 32 +sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_384,.-sqrx_384 +.type __sqrx_384,@function +.align 32 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 
+ movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrx_384,.-__sqrx_384 + + + +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,@function +.align 32 +redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + + + + +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,@function +.align 32 +fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +.type __mulx_by_1_mont_384,@function +.align 32 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + 
adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,@function +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,@function +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,@function +.align 32 +mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 
8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +.type __mulx_mont_384,@function +.align 32 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 
32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 
24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,@function +.align 32 +sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,@function +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,@function +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq 
%r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +.type __mulx_mont_383_nonred,@function +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 
8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + 
movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,@function +.align 32 +sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + 
.byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/sha256-armv8.S b/crypto/blst_src/build/elf/sha256-armv8.S new file mode 100644 index 00000000000..7341decf4f5 --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-armv8.S @@ -0,0 +1,1077 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with the rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.text + +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl blst_sha256_block_armv8 +.type blst_sha256_block_armv8,%function +.align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]!
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size blst_sha256_block_armv8,.-blst_sha256_block_armv8 +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,%function +.align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + 
eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor 
w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + 
eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + 
eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,%function +.align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + 
lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,%function +.align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,%function +.align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret +.size blst_sha256_hcopy,.-blst_sha256_hcopy diff --git a/crypto/blst_src/build/elf/sha256-portable-x86_64.s b/crypto/blst_src/build/elf/sha256-portable-x86_64.s new file mode 100644 index 00000000000..20b5c411306 --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-portable-x86_64.s @@ -0,0 +1,1754 @@ +.text + +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 16 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + 
movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + 
addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl 
%r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d 
+ rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + 
addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl 
%r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + 
+ andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 
4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/sha256-x86_64.s b/crypto/blst_src/build/elf/sha256-x86_64.s new file mode 100644 index 00000000000..47fdc5bc57a --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-x86_64.s @@ -0,0 +1,1446 @@ +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext +.hidden blst_sha256_block_data_order_shaext +.type blst_sha256_block_data_order_shaext,@function +.align 64 +blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 
15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext +.globl blst_sha256_block_data_order +.hidden blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 64 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + 
movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d 
+ rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl 
%eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl 
%r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 
24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + 
movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/mach-o/add_mod_256-armv8.S b/crypto/blst_src/build/mach-o/add_mod_256-armv8.S new file mode 100644 index 
00000000000..198d65aef69 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.align 5 +_add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.align 5 +_mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.align 5 +_lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.align 5 +_rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.align 5 +_cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.align 5 +_sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.align 5 +_check_mod_256: + ldp x8,x9,[x0] + ldp 
x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.align 5 +_add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.align 5 +_sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s new file mode 100644 index 00000000000..19e5ba9834f --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s @@ -0,0 +1,564 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.p2align 5 +_add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.p2align 5 +_mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 
+.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp L$oaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.p2align 5 +_lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz L$oop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.p2align 5 +_rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz L$oop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.p2align 5 +_cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq 
%r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.p2align 5 +_sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.p2align 5 +_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.p2align 5 +_add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.p2align 5 +_sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq 
%rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/add_mod_384-armv8.S b/crypto/blst_src/build/mach-o/add_mod_384-armv8.S new file mode 100644 index 00000000000..a62995f2bed --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.align 5 +_add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.align 5 +_add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.align 5 +_rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.align 5 +_div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.align 5 +_lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.align 5 +_mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.align 5 +_mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.align 5 +_mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.align 5 +_mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.align 5 +_cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.align 5 +_sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.align 5 +_sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.align 5 +_mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.align 5 +_sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.align 5 +_sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.align 5 +_vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.align 5 +_vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.align 5 +_vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.align 5 +_vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.align 5 +_vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.align 5 +_vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, 
v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.align 5 +_vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.align 5 +_vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, Loop_is_zero_done + +Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, Loop_is_zero + +Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.align 5 +_vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +Loop_is_equal: + sub x2, x2, #1 + cbz x2, Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b Loop_is_equal + nop + +Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s new file mode 100644 index 00000000000..974978e3425 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s @@ -0,0 +1,1899 @@ +.text + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.p2align 5 +_add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx 
+.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.p2align 5 +_add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.p2align 5 +_rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz L$oop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 
24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.p2align 5 +_div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.p2align 5 +_lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz L$oop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.p2align 5 +_mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.p2align 5 +_mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.p2align 5 +_mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call 
__lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.p2align 5 +_mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.p2align 5 +_cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq 
%r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.p2align 5 +_sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.p2align 5 +_sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.p2align 5 +_mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + 
pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.p2align 5 +_sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.p2align 5 +_sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + 
leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.p2align 5 +_vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.p2align 5 +_vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.p2align 5 +_vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 
64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.p2align 5 +_vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.p2align 5 +_vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + 
movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.p2align 5 +_vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.p2align 5 +_vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq 
%r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.p2align 5 +_vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + movdqu (%rdi),%xmm0 + leaq 16(%rdi),%rdi + +L$oop_is_zero: + decl %esi + jz L$oop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp L$oop_is_zero + +L$oop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.p2align 5 +_vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +L$oop_is_equal: + decl %edx + jz L$oop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp L$oop_is_equal + +L$oop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..2dc58f81608 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s @@ -0,0 +1,244 @@ +.text + + +.p2align 5 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 
72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x384 +.private_extern _add_mod_384x384 + +.p2align 5 +_add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x384 +.private_extern _sub_mod_384x384 + +.p2align 5 +_sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..f3a2c3b5f11 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl _ct_inverse_mod_256 + +.align 5 +_ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// + +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + + +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + 
eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + + +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + + +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + + +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + + +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, Loop_62_256 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..b6441da6e1f --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1177 @@ +.text + +.globl _ct_inverse_mod_256 + +.p2align 5 +_ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + 
movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + 
movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq 
%r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + 
movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq 
%r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz L$oop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +L$oop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz L$oop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..c7d9ba8488e --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl _ct_inverse_mod_383 + +.align 5 +_ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
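+//
+// Helper subroutines, in order of appearance:
+// __smul_383x63            - low 384 bits of |u|*f + |v|*g for signed
+//                            63-bit factors f and g;
+// __smul_767x63_tail       - completes the upper half of the 767-bit
+//                            |u|*f + |v|*g accumulation;
+// __smul_383_n_shift_by_62 - (|a|*f0 + |b|*g0) >> 62, negated if needed,
+//                            with the factors' signs adjusted accordingly;
+// __ab_approximation_62    - condenses |a| and |b| to two-limb
+//                            approximations (exact bottom limb plus the
+//                            top non-zero bits);
+// __inner_loop_62          - x2 branchless iterations producing the
+//                            transition factors |f0|, |g0|, |f1|, |g1|.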
+ +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + + +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + + +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, 
x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + + +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + + +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, Loop_62 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..b5c953d287a --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl _ct_is_square_mod_384 + +.align 5 +_ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
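+	// the aligned area holds working copies of the modulus |b| and the
+	// input |a|; x2 accumulates the Legendre symbol bit, so the routine
+	// returns 1 when the input is a quadratic residue mod |b|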
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b Loop_is_square + +.align 4 +Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + + +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor 
x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + + +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + + +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + + +.align 4 +__inner_loop_48: +Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, Loop_48 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..f2823941167 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,471 @@ +.text + +.globl _ct_is_square_mod_384 + +.p2align 5 +_ct_is_square_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp L$oop_is_square + +.p2align 5 +L$oop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz L$oop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq 
%r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +L$oop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz L$oop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +L$oop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz 
L$oop_48 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..185a876b87c --- /dev/null +++ b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1187 @@ +.text + +.globl _ct_inverse_mod_383 + +.p2align 5 +_ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 
48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + 
movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 
32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + 
adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 
+__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 3 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +L$oop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..3e05df3a4b3 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1566 @@ +.text + +.globl _ctx_inverse_mod_383 + +.p2align 5 +_ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + 
+ movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call 
__smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 
48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq 
%rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + 
xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 
48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz L$oop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +L$oop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_62 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/div3w-armv8.S b/crypto/blst_src/build/mach-o/div3w-armv8.S new file mode 100644 index 
00000000000..5a5eb3a01d7
--- /dev/null
+++ b/crypto/blst_src/build/mach-o/div3w-armv8.S
@@ -0,0 +1,88 @@
+.text
+
+.globl _div_3_limbs
+
+.align 5
+_div_3_limbs:
+ ldp x4,x5,[x0] // load R
+ eor x0,x0,x0 // Q = 0
+ mov x3,#64 // loop counter
+ nop
+
+Loop:
+ subs x6,x4,x1 // R - D
+ add x0,x0,x0 // Q <<= 1
+ sbcs x7,x5,x2
+ add x0,x0,#1 // Q + speculative bit
+ csel x4,x4,x6,lo // select between R and R - D
+ extr x1,x2,x1,#1 // D >>= 1
+ csel x5,x5,x7,lo
+ lsr x2,x2,#1
+ sbc x0,x0,xzr // subtract speculative bit
+ sub x3,x3,#1
+ cbnz x3,Loop
+
+ asr x3,x0,#63 // top bit -> mask
+ add x0,x0,x0 // Q <<= 1
+ subs x6,x4,x1 // R - D
+ add x0,x0,#1 // Q + specilative bit
+ sbcs x7,x5,x2
+ sbc x0,x0,xzr // subtract speculative bit
+
+ orr x0,x0,x3 // all ones if overflow
+
+ ret
+
+.globl _quot_rem_128
+
+.align 5
+_quot_rem_128:
+ ldp x3,x4,[x1]
+
+ mul x5,x3,x2 // divisor[0:1} * quotient
+ umulh x6,x3,x2
+ mul x11, x4,x2
+ umulh x7,x4,x2
+
+ ldp x8,x9,[x0] // load 3 limbs of the dividend
+ ldr x10,[x0,#16]
+
+ adds x6,x6,x11
+ adc x7,x7,xzr
+
+ subs x8,x8,x5 // dividend - divisor * quotient
+ sbcs x9,x9,x6
+ sbcs x10,x10,x7
+ sbc x5,xzr,xzr // borrow -> mask
+
+ add x2,x2,x5 // if borrowed, adjust the quotient ...
+ and x3,x3,x5
+ and x4,x4,x5
+ adds x8,x8,x3 // ... and add divisor
+ adc x9,x9,x4
+
+ stp x8,x9,[x0] // save 2 limbs of the remainder
+ str x2,[x0,#16] // and one limb of the quotient
+
+ mov x0,x2 // return adjusted quotient
+
+ ret
+
+
+.globl _quot_rem_64
+
+.align 5
+_quot_rem_64:
+ ldr x3,[x1]
+ ldr x8,[x0] // load 1 limb of the dividend
+
+ mul x5,x3,x2 // divisor * quotient
+
+ sub x8,x8,x5 // dividend - divisor * quotient
+
+ stp x8,x2,[x0] // save remainder and quotient
+
+ mov x0,x2 // return quotient
+
+ ret
+
diff --git a/crypto/blst_src/build/mach-o/div3w-x86_64.s b/crypto/blst_src/build/mach-o/div3w-x86_64.s
new file mode 100644
index 00000000000..8075571c87d
--- /dev/null
+++ b/crypto/blst_src/build/mach-o/div3w-x86_64.s
@@ -0,0 +1,115 @@
+.text
+
+.globl _div_3_limbs
+.private_extern _div_3_limbs
+
+.p2align 5
+_div_3_limbs:
+.cfi_startproc
+ .byte 0xf3,0x0f,0x1e,0xfa
+
+ movq (%rdi),%r8
+ movq 8(%rdi),%r9
+ xorq %rax,%rax
+ movl $64,%ecx
+
+L$oop:
+ movq %r8,%r10
+ subq %rsi,%r8
+ movq %r9,%r11
+ sbbq %rdx,%r9
+ leaq 1(%rax,%rax,1),%rax
+ movq %rdx,%rdi
+ cmovcq %r10,%r8
+ cmovcq %r11,%r9
+ sbbq $0,%rax
+ shlq $63,%rdi
+ shrq $1,%rsi
+ shrq $1,%rdx
+ orq %rdi,%rsi
+ subl $1,%ecx
+ jnz L$oop
+
+ leaq 1(%rax,%rax,1),%rcx
+ sarq $63,%rax
+
+ subq %rsi,%r8
+ sbbq %rdx,%r9
+ sbbq $0,%rcx
+
+ orq %rcx,%rax
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+.globl _quot_rem_128
+.private_extern _quot_rem_128
+
+.p2align 5
+_quot_rem_128:
+.cfi_startproc
+ .byte 0xf3,0x0f,0x1e,0xfa
+
+ movq %rdx,%rax
+ movq %rdx,%rcx
+
+ mulq 0(%rsi)
+ movq %rax,%r8
+ movq %rcx,%rax
+ movq %rdx,%r9
+
+ mulq 8(%rsi)
+ addq %rax,%r9
+ adcq $0,%rdx
+
+ movq 0(%rdi),%r10
+ movq 8(%rdi),%r11
+ movq 16(%rdi),%rax
+
+ subq %r8,%r10
+ sbbq %r9,%r11
+ sbbq %rdx,%rax
+ sbbq %r8,%r8
+
+ addq %r8,%rcx
+ movq %r8,%r9
+ andq 0(%rsi),%r8
+ andq 8(%rsi),%r9
+ addq %r8,%r10
+ adcq %r9,%r11
+
+ movq %r10,0(%rdi)
+ movq %r11,8(%rdi)
+ movq %rcx,16(%rdi)
+
+ movq %rcx,%rax
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+
+
+
+
+
+.globl _quot_rem_64
+.private_extern _quot_rem_64
+
+.p2align 5
+_quot_rem_64:
+.cfi_startproc
+ .byte 0xf3,0x0f,0x1e,0xfa
+
+ movq %rdx,%rax
+ imulq 0(%rsi),%rdx
+
+ movq 0(%rdi),%r10
+
+ subq %rdx,%r10
+
+ movq %r10,0(%rdi)
+ movq %rax,8(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+
diff --git
a/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S new file mode 100644 index 00000000000..4f506b58b0f --- /dev/null +++ b/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.align 5 +_mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr 
x29,[sp],#64 + ret + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.align 5 +_sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.align 5 +_from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.align 5 +_redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S new file mode 100644 index 00000000000..5aa2e9f3ae7 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl _add_mod_384x384 + +.align 5 +_add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl _sub_mod_384x384 + +.align 5 +_sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + + +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + + +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc 
x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.align 5 +_mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.align 5 +_sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.align 5 +_mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + 
adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.align 5 +_sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.align 5 +_sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl _sqr_384 +.private_extern _sqr_384 + +.align 5 +_sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.align 5 +_redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.align 5 +_from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul 
x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + + +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_384 +.private_extern _mul_384 + +.align 5 +_mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl _mul_382x +.private_extern _mul_382x + +.align 5 +_mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // _mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // _mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // _mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_382x +.private_extern _sqr_382x + +.align 5 +_sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // _mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // _mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.align 5 +_sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul 
x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 
+ mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.align 5 +_sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.align 5 +_sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..d83f5440342 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s @@ -0,0 +1,706 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.p2align 5 +_mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.p2align 5 +_sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq 
$0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.p2align 5 +_from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.p2align 5 +_redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + 
imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..0d8ac89cfc2 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s @@ -0,0 +1,3612 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq 
%rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.p2align 5 +_mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.p2align 5 +_sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq 
%r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_382x +.private_extern _mul_382x + +.p2align 5 +_mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_382x +.private_extern _sqr_382x + +.p2align 5 +_sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi 
+.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_384 +.private_extern _mul_384 + +.p2align 5 +_mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + 
movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_384 +.private_extern _sqr_384 + +.p2align 5 +_sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call 
__sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.p2align 5 +_sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + 
subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.p2align 5 +_redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.p2align 5 +_from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + 
movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq 
%rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.p2align 5 +_sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.p2align 5 +_sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq 
%r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.p2align 5 +_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq 
$0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + 
adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_n_mul_mont_384 +.private_extern _sqr_n_mul_mont_384 + +.p2align 5 +_sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 
+.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz L$oop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.p2align 5 +_sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz L$oop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + 
adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq 
%rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 
+ movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.p2align 5 +_sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..178372f41b2 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s @@ -0,0 +1,619 @@ 
+.text + +.globl _mulx_mont_sparse_256 +.private_extern _mulx_mont_sparse_256 + +.p2align 5 +_mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_mont_sparse_256 +.private_extern _sqrx_mont_sparse_256 + +.p2align 5 +_sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + 
adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _fromx_mont_256 +.private_extern _fromx_mont_256 + +.p2align 5 +_fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redcx_mont_256 +.private_extern _redcx_mont_256 + +.p2align 5 +_redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..95d3dadcc67 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s @@ -0,0 +1,2960 @@ +.text + + + + + + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + 
sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384x +.private_extern _mulx_mont_384x + +.p2align 5 +_mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + 
call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384x +.private_extern _sqrx_mont_384x + +.p2align 5 +_sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mulx_382x +.private_extern _mulx_382x + +.p2align 5 +_mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 
8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_382x +.private_extern _sqrx_382x + +.p2align 5 +_sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + 
adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_384 +.private_extern _mulx_384 + +.p2align 5 +_mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq 
%rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_384 +.private_extern _sqrx_384 + +.p2align 5 +_sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq 
%r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redcx_mont_384 +.private_extern _redcx_mont_384 + +.p2align 5 +_redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _fromx_mont_384 +.private_extern _fromx_mont_384 + +.p2align 5 +_fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + 
mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384 +.private_extern _sgn0x_pty_mont_384 + +.p2align 5 +_sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp 
+.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384x +.private_extern _sgn0x_pty_mont_384x + +.p2align 5 +_sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384 +.private_extern _mulx_mont_384 + +.p2align 5 +_mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 
40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq 
%rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq 
%rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384 +.private_extern _sqrx_mont_384 + +.p2align 5 +_sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_384 +.private_extern _sqrx_n_mul_mont_384 + +.p2align 5 +_sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +L$oop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_383 +.private_extern _sqrx_n_mul_mont_383 + +.p2align 5 +_sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + 
+L$oop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 
8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_382x +.private_extern _sqrx_mont_382x + +.p2align 5 +_sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 
+ pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/sha256-armv8.S b/crypto/blst_src/build/mach-o/sha256-armv8.S new file mode 100644 index 00000000000..c928f75025f --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-armv8.S @@ -0,0 +1,1077 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. 
+// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with raionale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.text + +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl _blst_sha256_block_armv8 + +.align 6 +_blst_sha256_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 
0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl _blst_sha256_block_data_order + +.align 4 +_blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b L_00_48 + +.align 4 +L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.align 4 +_blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.align 4 +_blst_sha256_bcopy: +Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,Loop_bcopy + ret + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.align 4 +_blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s new file mode 100644 index 00000000000..3f000720d00 --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s @@ -0,0 +1,1746 @@ +.text + +.globl _blst_sha256_block_data_order + +.p2align 4 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + 
bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi 
+ + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl 
%edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + 
rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + 
rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi 
+ addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/sha256-x86_64.s b/crypto/blst_src/build/mach-o/sha256-x86_64.s new file mode 100644 index 00000000000..dee75e35362 --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-x86_64.s @@ -0,0 +1,1438 @@ +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_block_data_order_shaext +.private_extern _blst_sha256_block_data_order_shaext + +.p2align 6 +_blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 
16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd 
%xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_block_data_order +.private_extern _blst_sha256_block_data_order + +.p2align 6 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor 
%xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor 
%xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl 
%r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl 
%r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d 
+ xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + 
.byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh new file mode 100755 index 00000000000..e8c8137c287 --- /dev/null +++ b/crypto/blst_src/build/refresh.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" + +PERL=${PERL:-perl} + +for pl in ../src/asm/*-x86_64.pl; do + s=`basename $pl .pl`.asm + expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) + s=`basename $pl .pl`.s + (set -x; ${PERL} $pl elf > elf/$s) + (set -x; ${PERL} $pl mingw64 > coff/$s) + (set -x; ${PERL} $pl macosx > mach-o/$s) +done + +for pl in ../src/asm/*-armv8.pl; do + s=`basename $pl .pl`.asm + (set -x; ${PERL} $pl win64 > win64/$s) + s=`basename $pl .pl`.S + (set -x; ${PERL} $pl linux64 > elf/$s) + (set -x; ${PERL} $pl coff64 > coff/$s) + (set -x; ${PERL} $pl ios64 > mach-o/$s) +done + +( cd ../bindings; + echo "LIBRARY blst" + echo + echo "EXPORTS" + cc -E blst.h | \ + ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' + echo +) > win64/blst.def + +if which bindgen > /dev/null 2>&1; then + ( cd ../bindings; set -x; + bindgen --opaque-type blst_pairing \ + --opaque-type blst_uniq \ + --with-derive-default \ + --with-derive-eq \ + --size_t-is-usize \ + --rustified-enum BLST.\* \ + blst.h -- -D__BLST_RUST_BINDGEN__ \ + | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs + ) +else + echo "Install Rust bindgen with 'cargo install bindgen'" 1>&2 + exit 1 +fi diff --git a/crypto/blst_src/build/win64/add_mod_256-armv8.asm b/crypto/blst_src/build/win64/add_mod_256-armv8.asm new file mode 100644 index 00000000000..8d6975185a6 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_256-armv8.asm @@ -0,0 +1,380 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_256|[FUNC] + ALIGN 32 +|add_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |mul_by_3_mod_256|[FUNC] + ALIGN 32 +|mul_by_3_mod_256| PROC + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc 
x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |lshift_mod_256|[FUNC] + ALIGN 32 +|lshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_lshift_mod_256| + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x12 + csello x9,x9,x13 + csello x10,x10,x14 + csello x11,x11,x15 + + cbnz x2,|$Loop_lshift_mod_256| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |rshift_mod_256|[FUNC] + ALIGN 32 +|rshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_rshift| + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + cselne x12,x12,x8 + cselne x13,x13,x9 + cselne x14,x14,x10 + cselne x15,x15,x11 + cselne x3,x3,xzr + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,|$Loop_rshift| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |cneg_mod_256|[FUNC] + ALIGN 32 +|cneg_mod_256| PROC + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x8,x8,x12 + cseleq x9,x9,x13 + cseleq x10,x10,x14 + stp x8,x9,[x0] + cseleq x11,x11,x15 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |sub_mod_256|[FUNC] + ALIGN 32 +|sub_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |check_mod_256|[FUNC] + ALIGN 32 +|check_mod_256| PROC + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + cselne x0,x0,xzr + and x0,x0,x1 + + ret + ENDP + + + + EXPORT |add_n_check_mod_256|[FUNC] + ALIGN 32 +|add_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + orr 
x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + + + + EXPORT |sub_n_check_mod_256|[FUNC] + ALIGN 32 +|sub_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm new file mode 100644 index 00000000000..09a5c17975d --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm @@ -0,0 +1,934 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_256 + + +ALIGN 32 +add_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_add_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oaded_a_add_mod_256:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_256:: +add_mod_256 ENDP + + +PUBLIC mul_by_3_mod_256 + + +ALIGN 32 +mul_by_3_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_by_3_mod_256:: + + + mov rcx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov rdx,rsi + mov r11,QWORD PTR[24+rsi] + + call __lshift_mod_256 + mov r12,QWORD PTR[rsp] + + jmp $L$oaded_a_add_mod_256 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_by_3_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_256:: +mul_by_3_mod_256 ENDP + + +ALIGN 32 +__lshift_mod_256 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + mov rax,r8 + adc r10,r10 + mov rsi,r9 + adc r11,r11 + sbb r12,r12 + + mov rbx,r10 
+ sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r8,rax + cmovc r9,rsi + cmovc r10,rbx + cmovc r11,rbp + + DB 0F3h,0C3h ;repret +__lshift_mod_256 ENDP + + +PUBLIC lshift_mod_256 + + +ALIGN 32 +lshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_lshift_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_lshift_mod_256:: + call __lshift_mod_256 + dec edx + jnz $L$oop_lshift_mod_256 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_lshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_256:: +lshift_mod_256 ENDP + + +PUBLIC rshift_mod_256 + + +ALIGN 32 +rshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_rshift_mod_256:: + + + mov rbp,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_rshift_mod_256:: + mov r8,rbp + and rbp,1 + mov rax,QWORD PTR[rcx] + neg rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + + and rax,rbp + and rsi,rbp + and rbx,rbp + and rbp,QWORD PTR[24+rcx] + + add r8,rax + adc r9,rsi + adc r10,rbx + adc r11,rbp + sbb rax,rax + + shr r8,1 + mov rbp,r9 + shr r9,1 + mov rbx,r10 + shr r10,1 + mov rsi,r11 + shr r11,1 + + shl rbp,63 + shl rbx,63 + or rbp,r8 + shl rsi,63 + or r9,rbx + shl rax,63 + or r10,rsi + or r11,rax + + dec edx + jnz $L$oop_rshift_mod_256 + + mov QWORD PTR[rdi],rbp + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_rshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_256:: +rshift_mod_256 ENDP + + +PUBLIC cneg_mod_256 + + +ALIGN 32 +cneg_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_cneg_mod_256:: + + + mov r12,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,r12 + mov r11,QWORD PTR[24+rsi] + or r12,r9 + or r12,r10 + or r12,r11 + mov rbp,-1 + + mov rax,QWORD PTR[rcx] + cmovnz r12,rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + and rax,r12 + mov rbp,QWORD PTR[24+rcx] + and rsi,r12 + and rbx,r12 + and rbp,r12 + + sub rax,r8 + sbb rsi,r9 + sbb rbx,r10 + sbb rbp,r11 + + or rdx,rdx + + cmovz rax,r8 + cmovz rsi,r9 + mov QWORD PTR[rdi],rax + cmovz rbx,r10 + mov QWORD PTR[8+rdi],rsi + cmovz rbp,r11 + mov QWORD PTR[16+rdi],rbx + mov QWORD PTR[24+rdi],rbp + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea 
rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_cneg_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_256:: +cneg_mod_256 ENDP + + +PUBLIC sub_mod_256 + + +ALIGN 32 +sub_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sub_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_256:: +sub_mod_256 ENDP + + +PUBLIC check_mod_256 + + +ALIGN 32 +check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + + + + mov rax,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + + mov r8,rax + or rax,r9 + or rax,r10 + or rax,r11 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rsi,rsi + + mov rdx,1 + cmp rax,0 + cmovne rax,rdx + and rax,rsi +$L$SEH_epilogue_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_check_mod_256:: +check_mod_256 ENDP + + +PUBLIC add_n_check_mod_256 + + +ALIGN 32 +add_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_n_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_add_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_n_check_mod_256:: +add_n_check_mod_256 ENDP + + +PUBLIC sub_n_check_mod_256 + + +ALIGN 32 +sub_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + 
mov r11,rsp +$L$SEH_begin_sub_n_check_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sub_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_n_check_mod_256:: +sub_n_check_mod_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_256 + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_prologue + + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_body + + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_end_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_256 + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_end_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_256 + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_prologue + + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_body + + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_end_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_256 + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_prologue + + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_body + + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_end_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_256 + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_prologue + + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_body + + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_end_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_mod_256 + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_prologue + + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_end_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_epilogue + + DD imagerel 
$L$SEH_epilogue_check_mod_256 + DD imagerel $L$SEH_end_check_mod_256 + DD imagerel $L$SEH_info_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_add_n_check_mod_256 + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_end_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_n_check_mod_256 + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_end_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_add_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_by_3_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_lshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_rshift_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_rshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_cneg_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sub_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 
000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_add_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sub_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/add_mod_384-armv8.asm b/crypto/blst_src/build/win64/add_mod_384-armv8.asm new file mode 100644 index 00000000000..4bf703a6da0 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384-armv8.asm @@ -0,0 +1,1001 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_384|[FUNC] + ALIGN 32 +|add_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +|__add_mod_384_ab_are_loaded| + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |add_mod_384x|[FUNC] + ALIGN 32 +|add_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |rshift_mod_384|[FUNC] + ALIGN 32 +|rshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_rshift_mod_384| + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,|$Loop_rshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__rshift_mod_384| PROC + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + ENDP + + + + EXPORT |div_by_2_mod_384|[FUNC] + ALIGN 32 +|div_by_2_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |lshift_mod_384|[FUNC] + ALIGN 32 +|lshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_lshift_mod_384| + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,|$Loop_lshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__lshift_mod_384| PROC + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |mul_by_3_mod_384|[FUNC] + ALIGN 32 +|mul_by_3_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384|[FUNC] + ALIGN 32 +|mul_by_8_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_3_mod_384x|[FUNC] + ALIGN 32 +|mul_by_3_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384x|[FUNC] + ALIGN 32 +|mul_by_8_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |cneg_mod_384|[FUNC] + ALIGN 32 +|cneg_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x10,x10,x16 + cseleq x11,x11,x17 + cseleq x12,x12,x19 + cseleq x13,x13,x20 + stp x10,x11,[x0] + cseleq x14,x14,x21 + stp x12,x13,[x0,#16] + cseleq x15,x15,x22 + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sub_mod_384|[FUNC] + ALIGN 32 +|sub_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + ENDP + + + + EXPORT |sub_mod_384x|[FUNC] + ALIGN 32 +|sub_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_1_plus_i_mod_384x|[FUNC] + ALIGN 32 +|mul_by_1_plus_i_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384x| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + 
orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + ENDP + + + EXPORT |vec_select_32|[FUNC] + ALIGN 32 +|vec_select_32| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_48|[FUNC] + ALIGN 32 +|vec_select_48| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_96|[FUNC] + ALIGN 32 +|vec_select_96| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_192|[FUNC] + ALIGN 32 +|vec_select_192| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_144|[FUNC] + ALIGN 32 +|vec_select_144| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_288|[FUNC] + ALIGN 32 +|vec_select_288| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 
{v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_prefetch|[FUNC] + ALIGN 32 +|vec_prefetch| PROC + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + prfm pldl1keep, [x0] + ret + ENDP + + + EXPORT |vec_is_zero_16x|[FUNC] + ALIGN 32 +|vec_is_zero_16x| PROC + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, |$Loop_is_zero_done| + +|$Loop_is_zero| + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, |$Loop_is_zero| + +|$Loop_is_zero_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + + + EXPORT |vec_is_equal_16x|[FUNC] + ALIGN 32 +|vec_is_equal_16x| PROC + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +|$Loop_is_equal| + sub x2, x2, #1 + cbz x2, |$Loop_is_equal_done| + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b |$Loop_is_equal| + nop + +|$Loop_is_equal_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm new file mode 100644 index 00000000000..8a7b9e255db --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm @@ -0,0 +1,2504 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_384 + + +ALIGN 32 +add_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384:: + + + call __add_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD 
PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384:: +add_mod_384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__add_mod_384_a_is_loaded:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + +PUBLIC add_mod_384x + + +ALIGN 32 +add_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_add_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __add_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_add_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x:: +add_mod_384x ENDP + + +PUBLIC rshift_mod_384 + + +ALIGN 32 +rshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_rshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_rshift_mod_384:: + call __rshift_mod_384 + dec edx + jnz $L$oop_rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_rshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_384:: +rshift_mod_384 ENDP + + +ALIGN 32 +__rshift_mod_384 PROC 
PRIVATE + DB 243,15,30,250 + mov rsi,1 + mov r14,QWORD PTR[rcx] + and rsi,r8 + mov r15,QWORD PTR[8+rcx] + neg rsi + mov rax,QWORD PTR[16+rcx] + and r14,rsi + mov rbx,QWORD PTR[24+rcx] + and r15,rsi + mov rbp,QWORD PTR[32+rcx] + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r8 + adc r15,r9 + adc rax,r10 + adc rbx,r11 + adc rbp,r12 + adc rsi,r13 + sbb r13,r13 + + shr r14,1 + mov r8,r15 + shr r15,1 + mov r9,rax + shr rax,1 + mov r10,rbx + shr rbx,1 + mov r11,rbp + shr rbp,1 + mov r12,rsi + shr rsi,1 + shl r8,63 + shl r9,63 + or r8,r14 + shl r10,63 + or r9,r15 + shl r11,63 + or r10,rax + shl r12,63 + or r11,rbx + shl r13,63 + or r12,rbp + or r13,rsi + + DB 0F3h,0C3h ;repret +__rshift_mod_384 ENDP + +PUBLIC div_by_2_mod_384 + + +ALIGN 32 +div_by_2_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_div_by_2_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_div_by_2_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov rcx,rdx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + call __rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_div_by_2_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_div_by_2_mod_384:: +div_by_2_mod_384 ENDP + + +PUBLIC lshift_mod_384 + + +ALIGN 32 +lshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_lshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_lshift_mod_384:: + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdi,rdi + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdi,0 + + mov rdi,QWORD PTR[rsp] + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + dec edx + jnz $L$oop_lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_lshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_384:: +lshift_mod_384 ENDP + + +ALIGN 32 
+__lshift_mod_384 PROC PRIVATE + DB 243,15,30,250 + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + DB 0F3h,0C3h ;repret +__lshift_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384 + + +ALIGN 32 +mul_by_3_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384:: +mul_by_3_mod_384 ENDP + +PUBLIC mul_by_8_mod_384 + + +ALIGN 32 +mul_by_8_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mul_by_8_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384:: +mul_by_8_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384x + + +ALIGN 32 +mul_by_3_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov rsi,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + + mov r8,QWORD PTR[48+rsi] + mov r9,QWORD PTR[56+rsi] + mov 
r10,QWORD PTR[64+rsi] + mov r11,QWORD PTR[72+rsi] + mov r12,QWORD PTR[80+rsi] + mov r13,QWORD PTR[88+rsi] + + call __lshift_mod_384 + + mov rdx,8*6 + add rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384x:: +mul_by_3_mod_384x ENDP + +PUBLIC mul_by_8_mod_384x + + +ALIGN 32 +mul_by_8_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_8_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov rsi,QWORD PTR[rsp] + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[((48+0))+rdi],r8 + mov QWORD PTR[((48+8))+rdi],r9 + mov QWORD PTR[((48+16))+rdi],r10 + mov QWORD PTR[((48+24))+rdi],r11 + mov QWORD PTR[((48+32))+rdi],r12 + mov QWORD PTR[((48+40))+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384x:: +mul_by_8_mod_384x ENDP + + +PUBLIC cneg_mod_384 + + +ALIGN 32 +cneg_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdx + +$L$SEH_body_cneg_mod_384:: + + + mov rdx,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,rdx + mov r11,QWORD PTR[24+rsi] + or rdx,r9 + mov r12,QWORD PTR[32+rsi] + or rdx,r10 + mov r13,QWORD PTR[40+rsi] + or rdx,r11 + mov rsi,-1 + or rdx,r12 + or rdx,r13 + + mov r14,QWORD PTR[rcx] + cmovnz rdx,rsi + mov r15,QWORD PTR[8+rcx] + mov rax,QWORD PTR[16+rcx] + and r14,rdx + mov rbx,QWORD PTR[24+rcx] + and r15,rdx + mov rbp,QWORD PTR[32+rcx] + and rax,rdx + mov rsi,QWORD PTR[40+rcx] + and rbx,rdx + mov rcx,QWORD PTR[rsp] + and rbp,rdx + and rsi,rdx + + sub r14,r8 + sbb r15,r9 + sbb rax,r10 + sbb rbx,r11 + sbb rbp,r12 + sbb rsi,r13 + + or rcx,rcx + + cmovz r14,r8 + cmovz r15,r9 + cmovz rax,r10 + mov QWORD PTR[rdi],r14 + cmovz rbx,r11 + mov QWORD PTR[8+rdi],r15 + cmovz rbp,r12 + mov QWORD PTR[16+rdi],rax + cmovz rsi,r13 + mov QWORD PTR[24+rdi],rbx 
+ mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rsi + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_cneg_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_384:: +cneg_mod_384 ENDP + + +PUBLIC sub_mod_384 + + +ALIGN 32 +sub_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384:: + + + call __sub_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384:: +sub_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP + +PUBLIC sub_mod_384x + + +ALIGN 32 +sub_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_sub_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __sub_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_sub_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x:: +sub_mod_384x ENDP +PUBLIC mul_by_1_plus_i_mod_384x + + +ALIGN 32 +mul_by_1_plus_i_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_1_plus_i_mod_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + 
push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,56 + +$L$SEH_body_mul_by_1_plus_i_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rbx,r11 + adc r11,QWORD PTR[72+rsi] + mov rcx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + mov QWORD PTR[48+rsp],rdi + sbb rdi,rdi + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rbx,QWORD PTR[72+rsi] + sbb rcx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rsi,rsi + + mov QWORD PTR[rsp],r8 + mov r8,QWORD PTR[rdx] + mov QWORD PTR[8+rsp],r9 + mov r9,QWORD PTR[8+rdx] + mov QWORD PTR[16+rsp],r10 + mov r10,QWORD PTR[16+rdx] + mov QWORD PTR[24+rsp],r11 + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[32+rsp],r12 + and r8,rsi + mov r12,QWORD PTR[32+rdx] + mov QWORD PTR[40+rsp],r13 + and r9,rsi + mov r13,QWORD PTR[40+rdx] + and r10,rsi + and r11,rsi + and r12,rsi + and r13,rsi + mov rsi,QWORD PTR[48+rsp] + + add r14,r8 + mov r8,QWORD PTR[rsp] + adc r15,r9 + mov r9,QWORD PTR[8+rsp] + adc rax,r10 + mov r10,QWORD PTR[16+rsp] + adc rbx,r11 + mov r11,QWORD PTR[24+rsp] + adc rcx,r12 + mov r12,QWORD PTR[32+rsp] + adc rbp,r13 + mov r13,QWORD PTR[40+rsp] + + mov QWORD PTR[rsi],r14 + mov r14,r8 + mov QWORD PTR[8+rsi],r15 + mov QWORD PTR[16+rsi],rax + mov r15,r9 + mov QWORD PTR[24+rsi],rbx + mov QWORD PTR[32+rsi],rcx + mov rax,r10 + mov QWORD PTR[40+rsi],rbp + + sub r8,QWORD PTR[rdx] + mov rbx,r11 + sbb r9,QWORD PTR[8+rdx] + sbb r10,QWORD PTR[16+rdx] + mov rcx,r12 + sbb r11,QWORD PTR[24+rdx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,r13 + sbb r13,QWORD PTR[40+rdx] + sbb rdi,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[48+rsi],r8 + cmovc r11,rbx + mov QWORD PTR[56+rsi],r9 + cmovc r12,rcx + mov QWORD PTR[64+rsi],r10 + cmovc r13,rbp + mov QWORD PTR[72+rsi],r11 + mov QWORD PTR[80+rsi],r12 + mov QWORD PTR[88+rsi],r13 + + mov r15,QWORD PTR[((56+0))+rsp] + + mov r14,QWORD PTR[((56+8))+rsp] + + mov r13,QWORD PTR[((56+16))+rsp] + + mov r12,QWORD PTR[((56+24))+rsp] + + mov rbx,QWORD PTR[((56+32))+rsp] + + mov rbp,QWORD PTR[((56+40))+rsp] + + lea rsp,QWORD PTR[((56+48))+rsp] + +$L$SEH_epilogue_mul_by_1_plus_i_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_1_plus_i_mod_384x:: +mul_by_1_plus_i_mod_384x ENDP +PUBLIC sgn0_pty_mod_384 + + +ALIGN 32 +sgn0_pty_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384:: + mov rdi,rcx + mov rsi,rdx + + + +$L$SEH_body_sgn0_pty_mod_384:: + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov rcx,QWORD PTR[32+rdi] + mov rdx,QWORD PTR[40+rdi] + + xor rax,rax + mov rdi,r8 + add r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + not rax + and rdi,1 + and rax,2 + or rax,rdi + +$L$SEH_epilogue_sgn0_pty_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384:: +sgn0_pty_mod_384 
ENDP + +PUBLIC sgn0_pty_mod_384x + + +ALIGN 32 +sgn0_pty_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384x:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mod_384x:: + + + mov r8,QWORD PTR[48+rdi] + mov r9,QWORD PTR[56+rdi] + mov r10,QWORD PTR[64+rdi] + mov r11,QWORD PTR[72+rdi] + mov rcx,QWORD PTR[80+rdi] + mov rdx,QWORD PTR[88+rdi] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + lea rax,QWORD PTR[rdi] + xor rdi,rdi + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rdi,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rdi,0 + + mov QWORD PTR[rsp],r8 + not rdi + and rbp,1 + and rdi,2 + or rdi,rbp + + mov r8,QWORD PTR[rax] + mov r9,QWORD PTR[8+rax] + mov r10,QWORD PTR[16+rax] + mov r11,QWORD PTR[24+rax] + mov rcx,QWORD PTR[32+rax] + mov rdx,QWORD PTR[40+rax] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + xor rax,rax + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + mov rbx,QWORD PTR[rsp] + + not rax + + test r8,r8 + cmovz rbp,rdi + + test rbx,rbx + cmovnz rax,rdi + + and rbp,1 + and rax,2 + or rax,rbp + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sgn0_pty_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384x:: +sgn0_pty_mod_384x ENDP +PUBLIC vec_select_32 + + +ALIGN 32 +vec_select_32 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[16+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[16+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[16+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-16))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-16))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-16)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-16)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_32 ENDP +PUBLIC vec_select_48 + + +ALIGN 32 +vec_select_48 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[24+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[24+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[24+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-24))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-24))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-24)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-24))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-24))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-24)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-24)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_48 ENDP +PUBLIC vec_select_96 + + +ALIGN 32 +vec_select_96 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[48+rdx] + 
pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[48+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[48+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-48)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-48)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_96 ENDP +PUBLIC vec_select_192 + + +ALIGN 32 +vec_select_192 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[96+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[96+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[96+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-96)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-96)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_192 ENDP +PUBLIC vec_select_144 
+ + +ALIGN 32 +vec_select_144 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[72+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[72+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[72+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-72)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-72)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_144 ENDP +PUBLIC vec_select_288 + + +ALIGN 32 +vec_select_288 PROC PUBLIC + DB 243,15,30,250 + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[144+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[144+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[144+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-144))+r8] + por xmm2,xmm3 
+ movdqu XMMWORD PTR[(112-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((176+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((176+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((192+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((192+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(192-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((208+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((208+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(208-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((224+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((224+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(224-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((240+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((240+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(240-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((256+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((256+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(256-144)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(272-144)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_288 ENDP +PUBLIC vec_prefetch + + +ALIGN 32 +vec_prefetch PROC PUBLIC + DB 243,15,30,250 + lea rdx,QWORD PTR[((-1))+rdx*1+rcx] + mov rax,64 + xor r8,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + prefetchnta [rcx] + DB 0F3h,0C3h ;repret +vec_prefetch ENDP +PUBLIC vec_is_zero_16x + + +ALIGN 32 +vec_is_zero_16x PROC PUBLIC + DB 243,15,30,250 + shr edx,4 + movdqu xmm0,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + +$L$oop_is_zero:: + dec edx + jz $L$oop_is_zero_done + movdqu xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + por xmm0,xmm1 + jmp $L$oop_is_zero + +$L$oop_is_zero_done:: + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 +DB 102,72,15,126,192 + inc edx + test rax,rax + cmovnz eax,edx + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_zero_16x ENDP +PUBLIC vec_is_equal_16x + + +ALIGN 32 +vec_is_equal_16x PROC PUBLIC + DB 243,15,30,250 + shr r8d,4 + movdqu xmm0,XMMWORD PTR[rcx] + movdqu xmm1,XMMWORD PTR[rdx] + sub rdx,rcx + lea rcx,QWORD PTR[16+rcx] + pxor xmm0,xmm1 + +$L$oop_is_equal:: + dec r8d + jz $L$oop_is_equal_done + movdqu xmm1,XMMWORD PTR[rcx] + movdqu xmm2,XMMWORD PTR[rdx*1+rcx] + lea rcx,QWORD PTR[16+rcx] + pxor xmm1,xmm2 + por xmm0,xmm1 + jmp $L$oop_is_equal + +$L$oop_is_equal_done:: + pshufd xmm1,xmm0,04eh + 
por xmm0,xmm1 +DB 102,72,15,126,192 + inc r8d + test rax,rax + cmovnz eax,r8d + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_equal_16x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384 + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_prologue + + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_body + + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_end_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_epilogue + + DD imagerel $L$SEH_begin_add_mod_384x + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_prologue + + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_body + + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_end_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_384 + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_prologue + + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_body + + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_end_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_div_by_2_mod_384 + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_prologue + + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_body + + DD imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_end_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_384 + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_prologue + + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_body + + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_end_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384 + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_end_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384 + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_end_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384x + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_end_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384x + DD imagerel 
$L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_end_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_384 + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_prologue + + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_body + + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_end_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384 + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_prologue + + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_end_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_prologue + + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_end_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_end_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384 + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_end_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384x + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_end_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 
+$L$SEH_info_add_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_add_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_rshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_rshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_div_by_2_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_div_by_2_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_div_by_2_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_lshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_lshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_3_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_3_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_8_mod_384x_body:: +DB 1,0,17,0 +DB 
000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_by_8_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_cneg_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_cneg_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sub_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_by_1_plus_i_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,007h,000h +DB 000h,0e4h,008h,000h +DB 000h,0d4h,009h,000h +DB 000h,0c4h,00ah,000h +DB 000h,034h,00bh,000h +DB 000h,054h,00ch,000h +DB 000h,074h,00eh,000h +DB 000h,064h,00fh,000h +DB 000h,0c2h +DB 000h,000h +$L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mod_384x_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h +$L$SEH_info_sgn0_pty_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm new file mode 100644 index 00000000000..57d1752fd3c --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm @@ -0,0 +1,334 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + +ALIGN 32 +__add_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + 
mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + add r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + adc r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + adc r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + adc r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + adc r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + adc r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + adc r14,QWORD PTR[48+rdx] + mov QWORD PTR[8+rdi],r9 + adc r15,QWORD PTR[56+rdx] + mov QWORD PTR[16+rdi],r10 + adc rax,QWORD PTR[64+rdx] + mov QWORD PTR[32+rdi],r12 + mov r8,r14 + adc rbx,QWORD PTR[72+rdx] + mov QWORD PTR[24+rdi],r11 + mov r9,r15 + adc rbp,QWORD PTR[80+rdx] + mov QWORD PTR[40+rdi],r13 + mov r10,rax + adc rsi,QWORD PTR[88+rdx] + mov r11,rbx + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r12,rbp + sbb rax,QWORD PTR[16+rcx] + sbb rbx,QWORD PTR[24+rcx] + sbb rbp,QWORD PTR[32+rcx] + mov r13,rsi + sbb rsi,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rdi],r14 + cmovc rbx,r11 + mov QWORD PTR[56+rdi],r15 + cmovc rbp,r12 + mov QWORD PTR[64+rdi],rax + cmovc rsi,r13 + mov QWORD PTR[72+rdi],rbx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__add_mod_384x384 ENDP + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + +PUBLIC add_mod_384x384 + + +ALIGN 32 +add_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384x384:: + + + call __add_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 
0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x384:: +add_mod_384x384 ENDP + +PUBLIC sub_mod_384x384 + + +ALIGN 32 +sub_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384x384:: + + + call __sub_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x384:: +sub_mod_384x384 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384x384 + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_prologue + + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_body + + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_end_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x384 + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_prologue + + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_end_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_add_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_add_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sub_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sub_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/blst.def b/crypto/blst_src/build/win64/blst.def new file mode 100644 index 00000000000..3fbb6b3a97d --- /dev/null +++ b/crypto/blst_src/build/win64/blst.def @@ -0,0 +1,217 @@ +LIBRARY blst + +EXPORTS + blst_scalar_from_uint32 + blst_uint32_from_scalar + blst_scalar_from_uint64 + blst_uint64_from_scalar + blst_scalar_from_bendian + blst_bendian_from_scalar + blst_scalar_from_lendian + blst_lendian_from_scalar + blst_scalar_fr_check + blst_sk_check + blst_sk_add_n_check + blst_sk_sub_n_check + blst_sk_mul_n_check + blst_sk_inverse + blst_scalar_from_le_bytes + blst_scalar_from_be_bytes + blst_fr_add + blst_fr_sub + blst_fr_mul_by_3 + blst_fr_lshift + 
blst_fr_rshift + blst_fr_mul + blst_fr_sqr + blst_fr_cneg + blst_fr_eucl_inverse + blst_fr_inverse + blst_fr_from_uint64 + blst_uint64_from_fr + blst_fr_from_scalar + blst_scalar_from_fr + blst_fp_add + blst_fp_sub + blst_fp_mul_by_3 + blst_fp_mul_by_8 + blst_fp_lshift + blst_fp_mul + blst_fp_sqr + blst_fp_cneg + blst_fp_eucl_inverse + blst_fp_inverse + blst_fp_sqrt + blst_fp_from_uint32 + blst_uint32_from_fp + blst_fp_from_uint64 + blst_uint64_from_fp + blst_fp_from_bendian + blst_bendian_from_fp + blst_fp_from_lendian + blst_lendian_from_fp + blst_fp2_add + blst_fp2_sub + blst_fp2_mul_by_3 + blst_fp2_mul_by_8 + blst_fp2_lshift + blst_fp2_mul + blst_fp2_sqr + blst_fp2_cneg + blst_fp2_eucl_inverse + blst_fp2_inverse + blst_fp2_sqrt + blst_fp12_sqr + blst_fp12_cyclotomic_sqr + blst_fp12_mul + blst_fp12_mul_by_xy00z0 + blst_fp12_conjugate + blst_fp12_inverse + blst_fp12_frobenius_map + blst_fp12_is_equal + blst_fp12_is_one + blst_fp12_in_group + blst_fp12_one + blst_p1_add + blst_p1_add_or_double + blst_p1_add_affine + blst_p1_add_or_double_affine + blst_p1_double + blst_p1_mult + blst_p1_cneg + blst_p1_to_affine + blst_p1_from_affine + blst_p1_on_curve + blst_p1_in_g1 + blst_p1_is_equal + blst_p1_is_inf + blst_p1_generator + blst_p1_affine_on_curve + blst_p1_affine_in_g1 + blst_p1_affine_is_equal + blst_p1_affine_is_inf + blst_p1_affine_generator + blst_p2_add + blst_p2_add_or_double + blst_p2_add_affine + blst_p2_add_or_double_affine + blst_p2_double + blst_p2_mult + blst_p2_cneg + blst_p2_to_affine + blst_p2_from_affine + blst_p2_on_curve + blst_p2_in_g2 + blst_p2_is_equal + blst_p2_is_inf + blst_p2_generator + blst_p2_affine_on_curve + blst_p2_affine_in_g2 + blst_p2_affine_is_equal + blst_p2_affine_is_inf + blst_p2_affine_generator + blst_p1s_to_affine + blst_p1s_add + blst_p1s_mult_wbits_precompute_sizeof + blst_p1s_mult_wbits_precompute + blst_p1s_mult_wbits_scratch_sizeof + blst_p1s_mult_wbits + blst_p1s_mult_pippenger_scratch_sizeof + blst_p1s_mult_pippenger + blst_p1s_tile_pippenger + blst_p2s_to_affine + blst_p2s_add + blst_p2s_mult_wbits_precompute_sizeof + blst_p2s_mult_wbits_precompute + blst_p2s_mult_wbits_scratch_sizeof + blst_p2s_mult_wbits + blst_p2s_mult_pippenger_scratch_sizeof + blst_p2s_mult_pippenger + blst_p2s_tile_pippenger + blst_map_to_g1 + blst_map_to_g2 + blst_encode_to_g1 + blst_hash_to_g1 + blst_encode_to_g2 + blst_hash_to_g2 + blst_p1_serialize + blst_p1_compress + blst_p1_affine_serialize + blst_p1_affine_compress + blst_p1_uncompress + blst_p1_deserialize + blst_p2_serialize + blst_p2_compress + blst_p2_affine_serialize + blst_p2_affine_compress + blst_p2_uncompress + blst_p2_deserialize + blst_keygen + blst_sk_to_pk_in_g1 + blst_sign_pk_in_g1 + blst_sk_to_pk_in_g2 + blst_sign_pk_in_g2 + blst_miller_loop + blst_final_exp + blst_precompute_lines + blst_miller_loop_lines + blst_fp12_finalverify + blst_pairing_sizeof + blst_pairing_init + blst_pairing_get_dst + blst_pairing_commit + blst_pairing_aggregate_pk_in_g2 + blst_pairing_chk_n_aggr_pk_in_g2 + blst_pairing_mul_n_aggregate_pk_in_g2 + blst_pairing_chk_n_mul_n_aggr_pk_in_g2 + blst_pairing_aggregate_pk_in_g1 + blst_pairing_chk_n_aggr_pk_in_g1 + blst_pairing_mul_n_aggregate_pk_in_g1 + blst_pairing_chk_n_mul_n_aggr_pk_in_g1 + blst_pairing_merge + blst_pairing_finalverify + blst_aggregate_in_g1 + blst_aggregate_in_g2 + blst_aggregated_in_g1 + blst_aggregated_in_g2 + blst_core_verify_pk_in_g1 + blst_core_verify_pk_in_g2 + BLS12_381_G1 + BLS12_381_NEG_G1 + BLS12_381_G2 + BLS12_381_NEG_G2 + blst_fr_to + 
blst_fr_from + blst_fp_to + blst_fp_from + blst_fp_is_square + blst_fp2_is_square + blst_p1_from_jacobian + blst_p2_from_jacobian + blst_sk_to_pk2_in_g1 + blst_sign_pk2_in_g1 + blst_sk_to_pk2_in_g2 + blst_sign_pk2_in_g2 + blst_uniq_sizeof + blst_uniq_init + blst_uniq_test + blst_expand_message_xmd + blst_p1_unchecked_mult + blst_p2_unchecked_mult + blst_pairing_raw_aggregate + blst_pairing_as_fp12 + blst_bendian_from_fp12 + blst_keygen_v3 + blst_keygen_v4_5 + blst_keygen_v5 + blst_derive_master_eip2333 + blst_derive_child_eip2333 + blst_scalar_from_hexascii + blst_fr_from_hexascii + blst_fp_from_hexascii + blst_p1_sizeof + blst_p1_affine_sizeof + blst_p2_sizeof + blst_p2_affine_sizeof + blst_fp12_sizeof + diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm new file mode 100644 index 00000000000..f3c2f0d05f9 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm @@ -0,0 +1,785 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_inverse_mod_256|[FUNC] + ALIGN 32 +|ct_inverse_mod_256| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl |$Lab_approximation_31_256_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, 
#256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + 
mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl 
__ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// + + ALIGN 32 +|__smul_256x63| PROC + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + cselne x22,x22,xzr + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in 
__smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + cselne x23,x23,xzr + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + ENDP + + + ALIGN 32 +|__smul_512x63_tail| PROC + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + ENDP + + + ALIGN 32 +|__smul_256_n_shift_by_31| PROC + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_31_256| PROC + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +|$Lab_approximation_31_256_loaded| + orr x19, x7, x11 // check top-most limbs, ... 
+ cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x5 + orr x19, x7, x11 // and ones before top-most, ... + cselne x10,x10,x9 + + cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x4 + orr x19, x7, x11 // and one more, ... + cselne x10,x10,x8 + + clz x19, x19 + cmp x19, #64 + cselne x19,x19,xzr + cselne x7,x7,x6 + cselne x11,x11,x10 + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + ENDP + + + ALIGN 16 +|__inner_loop_31_256| PROC + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +|$Loop_31_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + cselhs x15,x15,x13 + cselhs x13,x13,x19 + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, |$Loop_31_256| + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + ENDP + + + ALIGN 16 +|__inner_loop_62_256| PROC + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +|$Loop_62_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + mov x20, x13 + cselhs x12,x12,x14 + cselhs x14,x14,x19 + cselhs x13,x13,x15 + cselhs x15,x15,x20 + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, |$Loop_62_256| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm new file mode 100644 index 00000000000..65665c9f17a --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm @@ -0,0 +1,1211 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_256 + +ALIGN 32 +ct_inverse_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1072 + +$L$SEH_body_ct_inverse_mod_256:: + + + lea rax,QWORD PTR[((48+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + mov r12,QWORD PTR[rdx] + mov r13,QWORD PTR[8+rdx] + mov r14,QWORD PTR[16+rdx] + mov r15,QWORD PTR[24+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov rsi,rax + + + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[64+rdi],rdx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[72+rdi],rdx + + + xor rsi,256 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + + mov r8,QWORD PTR[64+rsi] + mov r12,QWORD PTR[104+rsi] + mov r9,r8 + imul r8,QWORD PTR[rsp] + mov r13,r12 + imul r12,QWORD PTR[8+rsp] + add r8,r12 + mov QWORD PTR[32+rdi],r8 + sar r8,63 + mov QWORD PTR[40+rdi],r8 + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r8 + lea rsi,QWORD PTR[64+rsi] + + imul r9,rdx + imul r13,rcx + add r9,r13 + mov QWORD PTR[72+rdi],r9 + sar r9,63 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + mov QWORD PTR[96+rdi],r9 + mov QWORD PTR[104+rdi],r9 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD 
PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD 
PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + sar rbp,63 + mov QWORD PTR[40+rdi],rbp + mov QWORD PTR[48+rdi],rbp + mov QWORD PTR[56+rdi],rbp + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + 
+ mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + + xor rsi,256+8*8 + mov edx,47 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[32+rsi] + + call __inner_loop_62_256 + + + + + + + + lea rsi,QWORD PTR[64+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_512x63 + adc rdx,rbp + + mov rsi,QWORD PTR[40+rsp] + mov rax,rdx + sar rdx,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + adc rax,0 + + mov rdx,rax + neg rax + or rdx,rax + sar rax,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + xor r8,rax + xor rcx,rcx + xor r9,rax + sub rcx,rax + xor r10,rax + xor rdx,rax + add r8,rcx + adc r9,0 + adc r10,0 + adc rdx,0 + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + lea r8,QWORD PTR[1072+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_256:: +ct_inverse_mod_256 ENDP + +ALIGN 32 +__smulq_512x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov QWORD PTR[8+rdi],r9 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov QWORD PTR[16+rdi],r10 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov QWORD PTR[24+rdi],r11 + + mov r8,QWORD PTR[40+rsi] + mov r9,QWORD PTR[48+rsi] + mov r10,QWORD PTR[56+rsi] + mov r11,QWORD PTR[64+rsi] + mov r12,QWORD PTR[72+rsi] + mov r13,QWORD PTR[80+rsi] + mov r14,QWORD PTR[88+rsi] + mov r15,QWORD PTR[96+rsi] + + mov rdx,rcx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rcx,rdx + add rcx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rcx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rcx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rcx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rcx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rcx + add r12,rax + mov rax,r13 + adc rdx,0 + mov 
r13,rdx + mul rcx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rcx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + imul rcx + add r15,rax + adc rdx,0 + + mov rbx,rbp + sar rbp,63 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,rbx + adc r13,rbp + adc r14,rbp + adc r15,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + DB 0F3h,0C3h ;repret +__smulq_512x63 ENDP + + +ALIGN 32 +__smulq_256x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov rbp,QWORD PTR[((0+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov rdx,rcx + mov r12,QWORD PTR[((40+0))+rsi] + mov r13,QWORD PTR[((40+8))+rsi] + mov r14,QWORD PTR[((40+16))+rsi] + mov r15,QWORD PTR[((40+24))+rsi] + mov rcx,QWORD PTR[((40+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rcx,rdx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + adc rcx,0 + + mul rbx + mov r12,rax + mov rax,r13 + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + and rcx,rbx + neg rcx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + adc rbp,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],rbp + + DB 0F3h,0C3h ;repret +__smulq_256x63 ENDP + +ALIGN 32 +__smulq_256_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov QWORD PTR[rdi],rdx + mov QWORD PTR[8+rdi],rcx + mov rbp,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + + mov rbx,rbp + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rbx,rbp + add rbx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + + mul rbx + mov r8,rax + mov rax,r9 + and rbp,rbx + neg rbp + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + adc rbp,rdx + mov r12,QWORD PTR[((32+0))+rsi] + mov r13,QWORD PTR[((32+8))+rsi] + mov r14,QWORD PTR[((32+16))+rsi] + mov r15,QWORD PTR[((32+24))+rsi] + + mov rbx,rcx + sar rcx,63 + xor rax,rax + sub rax,rcx + + xor rbx,rcx + add rbx,rax + + xor r12,rcx + xor r13,rcx + xor r14,rcx + xor r15,rcx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rbx + mov r12,rax + mov rax,r13 + and rcx,rbx + neg rcx + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + 
adc rbp,rcx + + mov rdx,QWORD PTR[rdi] + mov rcx,QWORD PTR[8+rdi] + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,rbp,31 + + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + xor rdx,rbp + xor rcx,rbp + add rdx,rax + add rcx,rax + + DB 0F3h,0C3h ;repret +__smulq_256_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31_256 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[24+rsi] + mov r11,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[48+rsi] + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[40+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[32+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + not rax + and r9,rax + and r11,rax + or r8,r9 + or r10,r11 + + jmp __inner_loop_31_256 + + DB 0F3h,0C3h ;repret +__ab_approximation_31_256 ENDP + +ALIGN 32 +__inner_loop_31_256 PROC PRIVATE + DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31_256:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edx,1 + jnz $L$oop_31_256 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31_256 ENDP + + +ALIGN 32 +__inner_loop_62_256 PROC PRIVATE + DB 243,15,30,250 + mov r15d,edx + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,rdx + mov r14,rdx + +$L$oop_62_256:: + xor rax,rax + test r8,r14 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,r14 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub r15d,1 + jnz $L$oop_62_256 + + DB 0F3h,0C3h ;repret +__inner_loop_62_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_256 + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_end_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_inverse_mod_256_body:: +DB 1,0,18,0 +DB 000h,0f4h,086h,000h +DB 000h,0e4h,087h,000h +DB 000h,0d4h,088h,000h +DB 000h,0c4h,089h,000h +DB 000h,034h,08ah,000h +DB 000h,054h,08bh,000h +DB 000h,074h,08dh,000h +DB 000h,064h,08eh,000h +DB 
000h,001h,08ch,000h +$L$SEH_info_ct_inverse_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm new file mode 100644 index 00000000000..4ab12e052df --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm @@ -0,0 +1,718 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_inverse_mod_383|[FUNC] + ALIGN 32 +|ct_inverse_mod_383| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl |$Lab_approximation_62_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add 
x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl 
__smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+ + ALIGN 32 +|__smul_383x63| PROC + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + ENDP + + + ALIGN 32 +|__smul_767x63_tail| PROC + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + ENDP + + + ALIGN 32 +|__smul_383_n_shift_by_62| PROC + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + 
adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_62| PROC + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +|$Lab_approximation_62_loaded| + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x22, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x22, x8, x14 // ... and ones before that ... 
+ cselne x13,x13,x11 + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x22, x8, x14 + cselne x13,x13,x10 + + clz x22, x22 + cmp x22, #64 + cselne x22,x22,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + ENDP + + ALIGN 16 +|__inner_loop_62| PROC + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +|$Loop_62| + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + cselhs x9,x9,x3 + cselhs x14,x14,x8 + cselhs x3,x26,x24 + cselhs x8,x27,x25 + cselhs x15,x15,x17 + cselhs x17,x17,x22 + cselhs x16,x16,x19 + cselhs x19,x19,x23 + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, |$Loop_62| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm new file mode 100644 index 00000000000..ab72328f056 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm @@ -0,0 +1,325 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |ct_is_square_mod_384|[FUNC] + ALIGN 32 +|ct_is_square_mod_384| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the |$Legendre| symbol + mov x15, #24 // 24 is 768/30-1 + b |$Loop_is_square| + + ALIGN 16 +|$Loop_is_square| + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, |$Loop_is_square| + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__smul_384_n_shift_by_30| PROC + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds 
x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + ENDP + + ALIGN 16 +|__ab_approximation_30| PROC + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x21, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x21, x8, x14 // ... and ones before that ... + cselne x13,x13,x11 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x21, x8, x14 // and one more, ... + cselne x13,x13,x10 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x3 + orr x21, x8, x14 + cselne x13,x13,x9 + + clz x21, x21 + cmp x21, #64 + cselne x21,x21,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + ENDP + + + ALIGN 16 +|__inner_loop_30| PROC + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +|$Loop_30| + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + cselhs x14,x14,x8 + cselhs x8,x23,x22 + cselhs x20,x20,x17 + cselhs x17,x17,x21 + cselhs x2,x2,x25 + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, |$Loop_30| + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + ENDP + + ALIGN 16 +|__inner_loop_48| PROC +|$Loop_48| + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + cselhs x9,x9,x3 + cselhs x3,x23,x22 + cselhs x2,x2,x25 + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, |$Loop_48| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm new file mode 100644 index 00000000000..38de6fc1229 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm @@ -0,0 +1,509 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_is_square_mod_384 + +ALIGN 32 +ct_is_square_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_is_square_mod_384:: + mov rdi,rcx 
+ mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,536 + +$L$SEH_body_ct_is_square_mod_384:: + + + lea rax,QWORD PTR[((24+255))+rsp] + and rax,-256 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rdx,QWORD PTR[32+rsi] + mov rdi,QWORD PTR[40+rsi] + mov rsi,rax + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rcx + mov QWORD PTR[80+rax],rdx + mov QWORD PTR[88+rax],rdi + + xor rbp,rbp + mov ecx,24 + jmp $L$oop_is_square + +ALIGN 32 +$L$oop_is_square:: + mov DWORD PTR[16+rsp],ecx + + call __ab_approximation_30 + mov QWORD PTR[rsp],rax + mov QWORD PTR[8+rsp],rbx + + mov rdi,128+8*6 + xor rdi,rsi + call __smulq_384_n_shift_by_30 + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __smulq_384_n_shift_by_30 + + mov ecx,DWORD PTR[16+rsp] + xor rsi,128 + + and r14,QWORD PTR[48+rdi] + shr r14,1 + add rbp,r14 + + sub ecx,1 + jnz $L$oop_is_square + + + + + mov r9,QWORD PTR[48+rsi] + call __inner_loop_48 + + mov rax,1 + and rax,rbp + xor rax,1 + + lea r8,QWORD PTR[536+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_is_square_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_is_square_mod_384:: +ct_is_square_mod_384 ENDP + + +ALIGN 32 +__smulq_384_n_shift_by_30 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r14,rdx + and r14,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r14 + mul rbx + add r13,rax + adc r14,rdx + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r15,rdx + and r15,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + 
add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r15 + mul rbx + add r13,rax + adc r15,rdx + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,r15 + + shrd r8,r9,30 + shrd r9,r10,30 + shrd r10,r11,30 + shrd r11,r12,30 + shrd r12,r13,30 + shrd r13,r14,30 + + sar r14,63 + xor rbx,rbx + sub rbx,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbx + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_384_n_shift_by_30 ENDP + +ALIGN 32 +__ab_approximation_30 PROC PRIVATE + DB 243,15,30,250 + mov rbx,QWORD PTR[88+rsi] + mov r15,QWORD PTR[80+rsi] + mov r14,QWORD PTR[72+rsi] + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r11 + mov r11,QWORD PTR[64+rsi] + cmovz r15,r14 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r10 + mov r10,QWORD PTR[56+rsi] + cmovz r15,r11 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r9 + mov r9,QWORD PTR[48+rsi] + cmovz r15,r10 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r8 + cmovz r15,r9 + + mov rax,r13 + or rax,rbx + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r13,r8 + cmovz rbx,r9 + cmovz rcx,rax + neg rcx + + + shld r13,r12,cl + shld rbx,r15,cl + + mov rax,0FFFFFFFF00000000h + mov r8d,r8d + mov r9d,r9d + and r13,rax + and rbx,rax + or r8,r13 + or r9,rbx + + jmp __inner_loop_30 + + DB 0F3h,0C3h ;repret +__ab_approximation_30 ENDP + +ALIGN 32 +__inner_loop_30 PROC PRIVATE + DB 243,15,30,250 + mov rbx,07FFFFFFF80000000h + mov rcx,0800000007FFFFFFFh + lea r15,QWORD PTR[((-1))+rbx] + mov edi,30 + +$L$oop_30:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbx + mov r13,rcx + mov r14,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbx,rcx + cmovb rcx,r12 + cmovb rbp,rax + + sub r8,r9 + sub rbx,rcx + add rbx,r15 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbx,r12 + cmovz rcx,r13 + cmovz rbp,r14 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rcx,rcx + lea rbp,QWORD PTR[rbp*1+rax] + sub rcx,r15 + + sub edi,1 + jnz $L$oop_30 + + shr r15,32 + mov eax,ebx + shr rbx,32 + mov edx,ecx + shr rcx,32 + sub rax,r15 + sub rbx,r15 + sub rdx,r15 + sub rcx,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_30 ENDP + + +ALIGN 32 +__inner_loop_48 PROC PRIVATE + DB 243,15,30,250 + mov edi,48 + +$L$oop_48:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbp,rax + + sub r8,r9 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbp,r12 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rbp,rax + + sub edi,1 + jnz $L$oop_48 + + DB 0F3h,0C3h ;repret +__inner_loop_48 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_is_square_mod_384 + DD imagerel $L$SEH_body_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_prologue + + DD imagerel $L$SEH_body_ct_is_square_mod_384 + DD imagerel 
$L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_body + + DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_end_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_is_square_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_is_square_mod_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,043h,000h +DB 000h,0e4h,044h,000h +DB 000h,0d4h,045h,000h +DB 000h,0c4h,046h,000h +DB 000h,034h,047h,000h +DB 000h,054h,048h,000h +DB 000h,074h,04ah,000h +DB 000h,064h,04bh,000h +DB 000h,001h,049h,000h +$L$SEH_info_ct_is_square_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm new file mode 100644 index 00000000000..de79f8ec80e --- /dev/null +++ b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm @@ -0,0 +1,1224 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_383 + +ALIGN 32 +ct_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ct_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + adc r9,rdx + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + 
add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + sar r13,63 + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + 
call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + xor rsi,256+8*12 + mov edi,62 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[48+rsi] + mov r11,QWORD PTR[56+rsi] + call __inner_loop_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + mov QWORD PTR[rdi],r8 + mov QWORD PTR[48+rdi],r10 + + + + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[96+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + + xor rsi,256+8*12 + mov edi,22 + + mov r8,QWORD PTR[rsi] + xor r9,r9 + mov r10,QWORD PTR[48+rsi] + xor r11,r11 + call __inner_loop_62 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_767x63 + + mov rsi,QWORD 
PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_383:: +ct_inverse_mod_383 ENDP + +ALIGN 32 +__smulq_767x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mov QWORD PTR[8+rdi],r9 + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mov QWORD PTR[16+rdi],r10 + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[24+rdi],r11 + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mov QWORD PTR[32+rdi],r12 + imul rbp + add r13,rax + adc rdx,0 + + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + mov rsi,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rsi,rdx + add rsi,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rbx,rdx + xor rbp,rdx + xor rcx,rdx + xor rdi,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mul rsi + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rsi + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rsi + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rsi + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rsi + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rsi + add r15,rax + mov rax,rbx + adc rdx,0 + mov rbx,rdx + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + mul rsi + add rbp,rax + mov rax,rcx + adc rdx,0 + mov rcx,rdx + mul rsi + add rcx,rax + mov rax,rdi + adc rdx,0 + mov rdi,rdx + mov rdx,QWORD PTR[8+rsp] + imul rax,rsi + mov 
rsi,QWORD PTR[16+rsp] + add rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulq_767x63 ENDP + +ALIGN 32 +__smulq_383x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_383x63 ENDP + +ALIGN 32 +__smulq_383_n_shift_by_62 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp 
+ add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[48+rsi] + mov r14,rdx + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,62 + shrd r9,r10,62 + shrd r10,r11,62 + shrd r11,r12,62 + shrd r12,r13,62 + shrd r13,r14,62 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulq_383_n_shift_by_62 ENDP + +ALIGN 32 +__ab_approximation_62 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[16+rsi] + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[rsi] + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,rbx + cmovz r11,rbp + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + jmp __inner_loop_62 + + DB 0F3h,0C3h ;repret +__ab_approximation_62 ENDP + +ALIGN 8 + DD 0 +__inner_loop_62 PROC PRIVATE + DB 243,15,30,250 + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + mov QWORD PTR[8+rsp],rsi + +$L$oop_62:: + xor rax,rax + xor rbx,rbx + test r8,1 + mov rbp,r10 + mov r14,r11 + cmovnz rax,r10 + cmovnz rbx,r11 + sub rbp,r8 + sbb r14,r9 + mov r15,r8 + mov rsi,r9 + sub r8,rax + sbb r9,rbx + cmovc r8,rbp + cmovc r9,r14 + cmovc r10,r15 + cmovc r11,rsi + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shrd r8,r9,1 + shr r9,1 + test r15,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub edi,1 + jnz $L$oop_62 + + mov rsi,QWORD PTR[8+rsp] + 
DB 0F3h,0C3h ;repret +__inner_loop_62 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_383 + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_end_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ct_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +$L$SEH_info_ct_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm new file mode 100644 index 00000000000..df4c46a4c44 --- /dev/null +++ b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm @@ -0,0 +1,1597 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ctx_inverse_mod_383 + +ALIGN 32 +ctx_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ctx_inverse_mod_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ctx_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + 
adc r9,rdx + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov 
rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + sar r13,63 + mov 
QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + 
mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call 
__smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + + xor rsi,256+8*12 + mov edi,53 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[48+rsi] + + call __inner_loop_62 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulx_767x63 + + mov rsi,QWORD PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ctx_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ctx_inverse_mod_383:: +ctx_inverse_mod_383 ENDP + +ALIGN 32 +__smulx_767x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + mov rax,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + sar rax,63 + xor rsi,rsi + sub rsi,rax + + xor rdx,rax + add rdx,rsi + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor r13,rax + xor r14,rax + xor r15,rax + xor rbx,rax + xor rbp,rax + xor rcx,rax + xor rdi,rax + add r8,rsi + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mulx rax,r8,r8 + mulx rsi,r9,r9 + add r9,rax + mulx rax,r10,r10 + adc r10,rsi + mulx rsi,r11,r11 + adc r11,rax + mulx rax,r12,r12 + adc r12,rsi + mulx rsi,r13,r13 + adc r13,rax + mulx rax,r14,r14 + adc r14,rsi + mulx rsi,r15,r15 + adc r15,rax + mulx rax,rbx,rbx + adc rbx,rsi + mulx 
rsi,rbp,rbp + adc rbp,rax + mulx rax,rcx,rcx + adc rcx,rsi + mulx rsi,rdi,rdi + mov rdx,QWORD PTR[8+rsp] + mov rsi,QWORD PTR[16+rsp] + adc rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulx_767x63 ENDP + +ALIGN 32 +__smulx_383x63 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + mov rdx,rcx + adc r13,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + adc r13,rbp + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulx_383x63 ENDP + +ALIGN 32 +__smulx_383_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + xor r14,r14 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc r14,rdx + + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD 
PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc rax,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,r12,31 + shrd r12,rax,31 + shrd rax,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor rax,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_383_n_shift_by_31 ENDP + +ALIGN 32 +__smulx_191_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + mov rbx,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor rax,r10 + add r8,rbp + adc r9,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r10,r9,r9 + add r9,rbp + adc r10,0 + imul rdx + add r10,rax + adc rdx,0 + mov r14,rdx + mov rdx,rcx + mov r11,QWORD PTR[((48+0))+rsi] + mov r12,QWORD PTR[((48+8))+rsi] + mov r13,QWORD PTR[((48+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r11,rax + xor r12,rax + xor rax,r13 + add r11,rbp + adc r12,0 + adc rax,0 + + mulx rbp,r11,r11 + mulx r13,r12,r12 + add r12,rbp + adc r13,0 + imul rdx + add r13,rax + adc rdx,0 + add r11,r8 + adc r12,r9 + adc r13,r10 + adc r14,rdx + mov rdx,rbx + + shrd r11,r12,31 + shrd r12,r13,31 + shrd r13,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r11,rbp + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r11 + mov QWORD PTR[8+rdi],r12 + mov QWORD PTR[16+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_191_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31 PROC PRIVATE + DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[16+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[8+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov 
rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + andn r9,rax,r9 + andn r11,rax,r11 + or r8,r9 + or r10,r11 + + jmp __inner_loop_31 + + DB 0F3h,0C3h ;repret +__ab_approximation_31 ENDP + +ALIGN 32 +__inner_loop_31 PROC PRIVATE + DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edi,1 + jnz $L$oop_31 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31 ENDP + + +ALIGN 32 +__inner_loop_62 PROC PRIVATE + DB 243,15,30,250 + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + +$L$oop_62:: + xor rax,rax + test r8,1 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub edi,1 + jnz $L$oop_62 + + DB 0F3h,0C3h ;repret +__inner_loop_62 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ctx_inverse_mod_383 + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_end_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ctx_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_ctx_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +$L$SEH_info_ctx_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/div3w-armv8.asm b/crypto/blst_src/build/win64/div3w-armv8.asm new file mode 100644 index 00000000000..7114ccf0c2e --- /dev/null +++ b/crypto/blst_src/build/win64/div3w-armv8.asm @@ -0,0 +1,89 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |div_3_limbs|[FUNC] + ALIGN 32 +|div_3_limbs| PROC + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +|$Loop| + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csello x4,x4,x6 + extr x1,x2,x1,#1 // D >>= 1 + csello x5,x5,x7 + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,|$Loop| + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 
// all ones if overflow + + ret + ENDP + + EXPORT |quot_rem_128|[FUNC] + ALIGN 32 +|quot_rem_128| PROC + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + ENDP + + + EXPORT |quot_rem_64|[FUNC] + ALIGN 32 +|quot_rem_64| PROC + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/div3w-x86_64.asm b/crypto/blst_src/build/win64/div3w-x86_64.asm new file mode 100644 index 00000000000..c35f426f3d2 --- /dev/null +++ b/crypto/blst_src/build/win64/div3w-x86_64.asm @@ -0,0 +1,152 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC div_3_limbs + + +ALIGN 32 +div_3_limbs PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_div_3_limbs:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + xor rax,rax + mov ecx,64 + +$L$oop:: + mov r10,r8 + sub r8,rsi + mov r11,r9 + sbb r9,rdx + lea rax,QWORD PTR[1+rax*1+rax] + mov rdi,rdx + cmovc r8,r10 + cmovc r9,r11 + sbb rax,0 + shl rdi,63 + shr rsi,1 + shr rdx,1 + or rsi,rdi + sub ecx,1 + jnz $L$oop + + lea rcx,QWORD PTR[1+rax*1+rax] + sar rax,63 + + sub r8,rsi + sbb r9,rdx + sbb rcx,0 + + or rax,rcx + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_div_3_limbs:: +div_3_limbs ENDP +PUBLIC quot_rem_128 + + +ALIGN 32 +quot_rem_128 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_quot_rem_128:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov rax,rdx + mov rcx,rdx + + mul QWORD PTR[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r9,rax + adc rdx,0 + + mov r10,QWORD PTR[rdi] + mov r11,QWORD PTR[8+rdi] + mov rax,QWORD PTR[16+rdi] + + sub r10,r8 + sbb r11,r9 + sbb rax,rdx + sbb r8,r8 + + add rcx,r8 + mov r9,r8 + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + add r10,r8 + adc r11,r9 + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],r11 + mov QWORD PTR[16+rdi],rcx + + mov rax,rcx + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_quot_rem_128:: +quot_rem_128 ENDP + + + + + +PUBLIC quot_rem_64 + + +ALIGN 32 +quot_rem_64 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_quot_rem_64:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + mov rax,rdx + imul rdx,QWORD PTR[rsi] + + mov r10,QWORD PTR[rdi] + + sub r10,rdx + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],rax + + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_quot_rem_64:: +quot_rem_64 ENDP +.text$ ENDS +.pdata SEGMENT READONLY 
ALIGN(4) +ALIGN 4 +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/dll.c b/crypto/blst_src/build/win64/dll.c new file mode 100644 index 00000000000..a70d0c98a23 --- /dev/null +++ b/crypto/blst_src/build/win64/dll.c @@ -0,0 +1,32 @@ +#include + +#if defined(_MSC_VER) +/* + * Even though we don't have memcpy/memset anywhere, MSVC compiler + * generates calls to them as it recognizes corresponding patterns. + */ +void *memcpy(unsigned char *dst, const unsigned char *src, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = *src++; + + return ret; +} + +void *memset(unsigned char *dst, int c, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = (unsigned char)c; + + return ret; +} +#elif defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) +{ return TRUE; } diff --git a/crypto/blst_src/build/win64/mul_mont_256-armv8.asm b/crypto/blst_src/build/win64/mul_mont_256-armv8.asm new file mode 100644 index 00000000000..bb2dfe043c7 --- /dev/null +++ b/crypto/blst_src/build/win64/mul_mont_256-armv8.asm @@ -0,0 +1,465 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |mul_mont_sparse_256|[FUNC] + ALIGN 32 +|mul_mont_sparse_256| PROC + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs 
x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csello x19,x19,x14 + csello x20,x20,x15 + csello x21,x21,x16 + csello x22,x22,x17 + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + ENDP + + + EXPORT |sqr_mont_sparse_256|[FUNC] + ALIGN 32 +|sqr_mont_sparse_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + EXPORT |from_mont_256|[FUNC] + ALIGN 32 +|from_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_256|[FUNC] + ALIGN 32 +|redc_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_256| PROC + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/mul_mont_384-armv8.asm b/crypto/blst_src/build/win64/mul_mont_384-armv8.asm new file mode 100644 index 00000000000..a309dfa4121 --- /dev/null +++ b/crypto/blst_src/build/win64/mul_mont_384-armv8.asm @@ -0,0 +1,2373 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |add_mod_384x384|[FUNC] + ALIGN 32 +|add_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + stp x11,x12,[x0,#48] + csello x15,x15,x23 + stp x13,x14,[x0,#64] + csello x16,x16,x24 + stp x15,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sub_mod_384x384|[FUNC] + ALIGN 32 +|sub_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + stp x11,x12,[x0] + csello x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs 
x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_mont_384x|[FUNC] + ALIGN 32 +|mul_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_384x|[FUNC] + ALIGN 32 +|sqr_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x19,x11,x19 + csello x20,x12,x20 + csello x21,x13,x21 + ldp x11,x12,[sp] + csello x22,x14,x22 + ldr x17, [sp,#48] + csello x23,x15,x23 + ldp x13,x14,[sp,#16] + csello x24,x16,x24 + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_mont_384|[FUNC] + ALIGN 32 +|mul_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_384| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs 
x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csello x11,x19,x26 + csello x12,x20,x27 + csello x13,x21,x28 + csello x14,x22,x0 + csello x15,x23,x1 + csello x16,x24,x3 + ret + ENDP + + + + EXPORT |sqr_mont_384|[FUNC] + ALIGN 32 +|sqr_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_n_mul_mont_383|[FUNC] + ALIGN 32 +|sqr_n_mul_mont_383| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +|$Loop_sqr_383| + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,|$Loop_sqr_383| + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + ALIGN 32 +|__sqr_384| PROC + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sqr_384|[FUNC] + ALIGN 32 +|sqr_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_384|[FUNC] + ALIGN 32 +|redc_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |from_mont_384|[FUNC] + ALIGN 32 +|from_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_384| PROC + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul 
x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + ENDP + + + ALIGN 32 +|__redc_tail_mont_384| PROC + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_384|[FUNC] + ALIGN 32 +|mul_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_384| PROC + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + ENDP + + + + EXPORT |mul_382x|[FUNC] + ALIGN 32 +|mul_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_382x|[FUNC] + ALIGN 32 +|sqr_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 
3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_382x|[FUNC] + ALIGN 32 +|sqr_mont_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_383_nonred| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs 
x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds 
x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm new file mode 100644 index 00000000000..c3bf8634617 --- /dev/null +++ b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm @@ -0,0 +1,884 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mul_mont_sparse_256 + + +ALIGN 32 +mul_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_mul_mont_sparse_256:: + + + mov rax,QWORD PTR[rdx] + mov r13,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[24+rsi] + mov rbx,rdx + + mov r15,rax + mul r13 + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_sparse_256:: +mul_mont_sparse_256 ENDP + +PUBLIC sqr_mont_sparse_256 + + +ALIGN 32 +sqr_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqr_mont_sparse_256:: + + + mov rax,QWORD PTR[rsi] + mov r8,rcx + mov r14,QWORD PTR[8+rsi] + mov rcx,rdx + mov r12,QWORD PTR[16+rsi] + lea rbx,QWORD PTR[rsi] + mov rbp,QWORD PTR[24+rsi] + + mov r15,rax + mul rax + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD 
PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_sparse_256:: +sqr_mont_sparse_256 ENDP + +ALIGN 32 +__mulq_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mul r14 + add r10,rax + mov rax,r15 + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + xor r14,r14 + mov r13,rdx + + mov rdi,r9 + imul r9,r8 + + + mov r15,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,r15 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,r15 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc r14,rdx + xor r15,r15 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r9 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r12,rbp + adc rdx,0 + add r13,rdx + adc r14,0 + adc r15,0 + mov rdi,r10 + imul r10,r8 + + + mov r9,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,r9 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc r15,rdx + xor r9,r9 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r10 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r13,rbp + adc rdx,0 + add r14,rdx + adc r15,0 + adc r9,0 + mov rdi,r11 + imul r11,r8 + + + mov r10,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,r10 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc r9,rdx + xor r10,r10 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r11 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + add r15,rdx + adc r9,0 + adc r10,0 + imul rax,r8 + mov rsi,QWORD PTR[8+rsp] + + + mov r11,rax + mul QWORD PTR[rcx] + add r12,rax + mov rax,r11 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r11 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul 
QWORD PTR[24+rcx] + mov rbx,r14 + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r13 + adc rdx,0 + add r9,rdx + adc r10,0 + + + + + mov r12,r15 + sub r13,QWORD PTR[rcx] + sbb r14,QWORD PTR[8+rcx] + sbb r15,QWORD PTR[16+rcx] + mov rbp,r9 + sbb r9,QWORD PTR[24+rcx] + sbb r10,0 + + cmovc r13,rax + cmovc r14,rbx + cmovc r15,r12 + mov QWORD PTR[rsi],r13 + cmovc r9,rbp + mov QWORD PTR[8+rsi],r14 + mov QWORD PTR[16+rsi],r15 + mov QWORD PTR[24+rsi],r9 + + DB 0F3h,0C3h ;repret + +__mulq_mont_sparse_256 ENDP +PUBLIC from_mont_256 + + +ALIGN 32 +from_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + + + + + mov r10,r14 + mov r11,r15 + mov r12,r9 + + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_256:: +from_mont_256 ENDP + +PUBLIC redc_mont_256 + + +ALIGN 32 +redc_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + add r13,QWORD PTR[32+rsi] + adc r14,QWORD PTR[40+rsi] + mov rax,r13 + adc r15,QWORD PTR[48+rsi] + mov r10,r14 + adc r9,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r11,r15 + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + mov r12,r9 + sbb r9,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_256:: +redc_mont_256 ENDP + +ALIGN 32 +__mulq_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r10,QWORD PTR[8+rsi] + mov r11,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + + mov r13,rax + imul rax,rcx + mov r9,rax + + mul QWORD PTR[rbx] + add r13,rax + mov rax,r9 + adc r13,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[16+rbx] + mov r14,r10 + imul r10,rcx + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add 
r12,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r9,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r12 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_sparse_256 + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_end_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_sparse_256 + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_end_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_from_mont_256 + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_prologue + + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_body + + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_end_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_epilogue + + DD imagerel $L$SEH_begin_redc_mont_256 + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_prologue + + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_body + + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_end_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mul_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 
000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm new file mode 100644 index 00000000000..0ccb46786c3 --- /dev/null +++ b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm @@ -0,0 +1,4233 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD 
PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mul_mont_384x + + +ALIGN 32 +mul_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mul_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulq_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((40+96))+rsp] + call __mulq_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rdx,QWORD PTR[((-48))+rsi] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + mov rbx,rcx + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov 
rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384x:: +mul_mont_384x ENDP +PUBLIC sqr_mont_384x + + +ALIGN 32 +sqr_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + call __mulq_mont_384 + add r14,r14 + adc r15,r15 + adc r8,r8 + mov r12,r14 + adc r9,r9 + mov r13,r15 + adc r10,r10 + mov rax,r8 + adc r11,r11 + mov rbx,r9 + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rbp,r10 + sbb r8,QWORD PTR[16+rcx] + sbb r9,QWORD PTR[24+rcx] + sbb r10,QWORD PTR[32+rcx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r12 + cmovc r15,r13 + cmovc r8,rax + mov QWORD PTR[48+rdi],r14 + cmovc r9,rbx + mov QWORD PTR[56+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[64+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384x:: +sqr_mont_384x ENDP + +PUBLIC mul_382x + + +ALIGN 32 +mul_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mul_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov 
r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[48+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_382x:: +mul_382x ENDP +PUBLIC sqr_382x + + +ALIGN 32 +sqr_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqr_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulq_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov 
QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_382x:: +sqr_382x ENDP +PUBLIC mul_384 + + +ALIGN 32 +mul_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_384:: + + + mov rbx,rdx + call __mulq_384 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_384:: +mul_384 ENDP + + +ALIGN 32 +__mulq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rbx] + + mov rbp,rax + mul QWORD PTR[rsi] + mov QWORD PTR[rdi],rax + mov rax,rbp + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r11,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[8+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[16+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[24+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + 
mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[32+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[40+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,rax + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[48+rdi],rcx + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r10 + mov QWORD PTR[80+rdi],r11 + mov QWORD PTR[88+rdi],r12 + + DB 0F3h,0C3h ;repret +__mulq_384 ENDP +PUBLIC sqr_384 + + +ALIGN 32 +sqr_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqr_384:: + + + call __sqrq_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_384:: +sqr_384 ENDP + + +ALIGN 32 +__sqrq_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rcx,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + + + mov r14,rax + mul r15 + mov r9,rax + mov rax,r14 + mov rbp,QWORD PTR[32+rsi] + mov r10,rdx + + mul rcx + add r10,rax + mov rax,r14 + adc rdx,0 + mov rsi,QWORD PTR[40+rsi] + mov r11,rdx + + mul rbx + add r11,rax + mov rax,r14 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,r14 + adc rdx,0 + mov r13,rdx + + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + + mul rax + xor r8,r8 + mov QWORD PTR[rdi],rax + mov rax,r15 + add r9,r9 + adc r8,0 + add r9,rdx + adc r8,0 + mov QWORD PTR[8+rdi],r9 + + mul rcx + add r11,rax + mov rax,r15 + adc rdx,0 + mov r9,rdx + + mul rbx + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul rbp + add r13,rax + mov rax,r15 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r15,rdx + + mul rax + xor r9,r9 + add r8,rax + mov rax,rcx + add r10,r10 + adc r11,r11 + adc r9,0 + add r10,r8 + adc r11,rdx + adc r9,0 + mov QWORD PTR[16+rdi],r10 + + mul rbx + add r13,rax + mov rax,rcx + adc rdx,0 + mov QWORD PTR[24+rdi],r11 + mov r8,rdx + + mul rbp + add r14,rax + mov rax,rcx + adc rdx,0 + add r14,r8 + adc 
rdx,0 + mov r8,rdx + + mul rsi + add r15,rax + mov rax,rcx + adc rdx,0 + add r15,r8 + adc rdx,0 + mov rcx,rdx + + mul rax + xor r11,r11 + add r9,rax + mov rax,rbx + add r12,r12 + adc r13,r13 + adc r11,0 + add r12,r9 + adc r13,rdx + adc r11,0 + mov QWORD PTR[32+rdi],r12 + + + mul rbp + add r15,rax + mov rax,rbx + adc rdx,0 + mov QWORD PTR[40+rdi],r13 + mov r8,rdx + + mul rsi + add rcx,rax + mov rax,rbx + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov rbx,rdx + + mul rax + xor r12,r12 + add r11,rax + mov rax,rbp + add r14,r14 + adc r15,r15 + adc r12,0 + add r14,r11 + adc r15,rdx + mov QWORD PTR[48+rdi],r14 + adc r12,0 + mov QWORD PTR[56+rdi],r15 + + + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + + mul rax + xor r13,r13 + add r12,rax + mov rax,rsi + add rcx,rcx + adc rbx,rbx + adc r13,0 + add rcx,r12 + adc rbx,rdx + mov QWORD PTR[64+rdi],rcx + adc r13,0 + mov QWORD PTR[72+rdi],rbx + + + mul rax + add rax,r13 + add rbp,rbp + adc rdx,0 + add rax,rbp + adc rdx,0 + mov QWORD PTR[80+rdi],rax + mov QWORD PTR[88+rdi],rdx + + DB 0F3h,0C3h ;repret +__sqrq_384 ENDP + +PUBLIC sqr_mont_384 + + +ALIGN 32 +sqr_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*15 + +$L$SEH_body_sqr_mont_384:: + + + mov QWORD PTR[96+rsp],rcx + mov QWORD PTR[104+rsp],rdx + mov QWORD PTR[112+rsp],rdi + + mov rdi,rsp + call __sqrq_384 + + lea rsi,QWORD PTR[rsp] + mov rcx,QWORD PTR[96+rsp] + mov rbx,QWORD PTR[104+rsp] + mov rdi,QWORD PTR[112+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[120+rsp] + mov r15,QWORD PTR[120+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384:: +sqr_mont_384 ENDP + + + +PUBLIC redc_mont_384 + + +ALIGN 32 +redc_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_384:: +redc_mont_384 ENDP + + + + +PUBLIC from_mont_384 + + +ALIGN 32 +from_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + + + + + + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov 
r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_384:: +from_mont_384 ENDP + +ALIGN 32 +__mulq_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,rax + imul rax,rcx + mov r8,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r8 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,r9 + imul r9,rcx + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[32+rbx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[40+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r9 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,r10 + imul r10,rcx + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r8,rax + mov rax,r10 + adc r8,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,r11 + imul r11,rcx + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r11 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,r12 + imul r12,rcx + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + 
adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,r13 + imul r13,rcx + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r8,rax + mov rax,r13 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384 + + +ALIGN 32 +sgn0_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384:: +sgn0_pty_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384x + + +ALIGN 32 +sgn0_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push 
r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384x:: +sgn0_pty_mont_384x ENDP +PUBLIC mul_mont_384 + + +ALIGN 32 +mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*3 + +$L$SEH_body_mul_mont_384:: + + + mov rax,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + mov rbx,rdx + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + + call __mulq_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384:: +mul_mont_384 ENDP + +ALIGN 32 +__mulq_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov rdi,rax + mul r14 + mov r8,rax + mov rax,rdi + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mov rbp,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + xor r15,r15 + mov r14,rdx + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r8 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul QWORD 
PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,rbp + adc r14,rdx + adc r15,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mov rbp,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r14,r8 + adc rdx,0 + xor r8,r8 + add r14,rax + mov rax,r9 + adc r15,rdx + adc r8,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r9 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,rbp + adc r15,rdx + adc r8,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mov rbp,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r15,r9 + adc rdx,0 + xor r9,r9 + add r15,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r10 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,rbp + adc r8,rdx + adc r9,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mov rbp,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r8,r10 + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r11 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add 
r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rbp + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,rbp + adc r9,rdx + adc r10,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mov rbp,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r9,r11 + adc rdx,0 + xor r11,r11 + add r9,rax + mov rax,r12 + adc r10,rdx + adc r11,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r12 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,rbp + adc r10,rdx + adc r11,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mov rbp,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rsi] + add r10,r12 + adc rdx,0 + xor r12,r12 + add r10,rax + mov rax,r13 + adc r11,rdx + adc r12,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r13 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r8,rbp + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,rbp + adc r11,rdx + adc r12,0 + + + + + mov rdi,QWORD PTR[16+rsp] + sub r14,QWORD PTR[rcx] + mov rdx,r15 + sbb r15,QWORD PTR[8+rcx] + mov rbx,r8 + sbb r8,QWORD PTR[16+rcx] + mov rsi,r9 + sbb r9,QWORD PTR[24+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[32+rcx] + mov r13,r11 + sbb r11,QWORD PTR[40+rcx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rdx + cmovc r8,rbx + mov QWORD PTR[rdi],r14 + cmovc r9,rsi + mov QWORD PTR[8+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[16+rdi],r8 + cmovc r11,r13 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 
0F3h,0C3h ;repret +__mulq_mont_384 ENDP +PUBLIC sqr_n_mul_mont_384 + + +ALIGN 32 +sqr_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_384:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_384:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd edx,xmm1 + lea rsi,QWORD PTR[rdi] + dec edx + jnz $L$oop_sqr_384 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_384:: +sqr_n_mul_mont_384 ENDP + +PUBLIC sqr_n_mul_mont_383 + + +ALIGN 32 +sqr_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_383:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_383:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + + movd edx,xmm1 + add r14,QWORD PTR[48+rsi] + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + lea rsi,QWORD PTR[rdi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + dec edx + jnz $L$oop_sqr_383 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_383:: +sqr_n_mul_mont_383 ENDP + +ALIGN 32 +__mulq_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + mov rbp,rax + mul r14 + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mov r15,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add 
r11,rax + mov rax,rbp + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rcx] + add r15,rax + mov rax,r8 + adc r15,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rcx] + add r11,r15 + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,r15 + adc r14,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rsi] + add r14,r15 + adc rdx,0 + add r14,rax + mov rax,r9 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rcx] + add r8,rax + mov rax,r9 + adc r8,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rcx] + add r12,r8 + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,r8 + adc r15,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r15,r8 + adc rdx,0 + add r15,rax + mov rax,r10 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rcx] + add r9,rax + mov rax,r10 + adc r9,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rcx] + add r13,r9 + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,r9 + adc r8,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r9 + adc rdx,0 
+ mov r9,rdx + + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r8,r9 + adc rdx,0 + add r8,rax + mov rax,r11 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rcx] + add r10,rax + mov rax,r11 + adc r10,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rcx] + add r14,r10 + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,r10 + adc r9,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r9,r10 + adc rdx,0 + add r9,rax + mov rax,r12 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rcx] + add r11,rax + mov rax,r12 + adc r11,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rcx] + add r15,r11 + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,r11 + adc r10,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mov r12,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r10,r11 + adc rdx,0 + add r10,rax + mov rax,r13 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[rcx] + add r12,rax + mov rax,r13 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[24+rcx] + add r8,r12 + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r12 + adc r11,rdx + DB 0F3h,0C3h ;repret +__mulq_mont_383_nonred ENDP +PUBLIC sqr_mont_382x + + +ALIGN 32 +sqr_mont_382x 
PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rsi + mov QWORD PTR[24+rsp],rdi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov rdi,QWORD PTR[24+rsp] + call __mulq_mont_383_nonred + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],r8 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_383_nonred + mov rsi,QWORD PTR[((32+96))+rsp] + mov r12,QWORD PTR[((32+0))+rsp] + mov r13,QWORD PTR[((32+8))+rsp] + and r12,rsi + mov rax,QWORD PTR[((32+16))+rsp] + and r13,rsi + mov rbx,QWORD PTR[((32+24))+rsp] + and rax,rsi + mov rbp,QWORD PTR[((32+32))+rsp] + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[((32+40))+rsp] + + sub r14,r12 + mov r12,QWORD PTR[rcx] + sbb r15,r13 + mov r13,QWORD PTR[8+rcx] + sbb r8,rax + mov rax,QWORD PTR[16+rcx] + sbb r9,rbx + mov rbx,QWORD PTR[24+rcx] + sbb r10,rbp + mov rbp,QWORD PTR[32+rcx] + sbb r11,rsi + sbb rsi,rsi + + and r12,rsi + and r13,rsi + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r12 + adc r15,r13 + adc r8,rax + adc r9,rbx + adc r10,rbp + adc r11,rsi + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_382x:: +sqr_mont_382x ENDP +.text$ ENDS 
+.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_384x + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_prologue + + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_body + + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_end_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384x + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_prologue + + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_end_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_382x + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_info_mul_382x_prologue + + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_info_mul_382x_body + + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_end_mul_382x + DD imagerel $L$SEH_info_mul_382x_epilogue + + DD imagerel $L$SEH_begin_sqr_382x + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_prologue + + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_body + + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_end_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_epilogue + + DD imagerel $L$SEH_begin_mul_384 + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_info_mul_384_prologue + + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_info_mul_384_body + + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_end_mul_384 + DD imagerel $L$SEH_info_mul_384_epilogue + + DD imagerel $L$SEH_begin_sqr_384 + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_info_sqr_384_prologue + + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_info_sqr_384_body + + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_end_sqr_384 + DD imagerel $L$SEH_info_sqr_384_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384 + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_end_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_epilogue + + DD imagerel $L$SEH_begin_redc_mont_384 + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_prologue + + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_body + + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_end_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_epilogue + + DD imagerel $L$SEH_begin_from_mont_384 + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_prologue + + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_body + + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_end_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384 + DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_prologue + + 
DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_end_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384x + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_end_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_mont_384 + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_prologue + + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_end_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_384 + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_end_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_383 + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_end_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_382x + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_prologue + + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_end_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mul_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 
+$L$SEH_info_mul_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mul_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_384_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqr_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,00fh,000h +DB 000h,0e4h,010h,000h +DB 000h,0d4h,011h,000h +DB 000h,0c4h,012h,000h +DB 000h,034h,013h,000h +DB 000h,054h,014h,000h +DB 000h,074h,016h,000h +DB 000h,064h,017h,000h +DB 000h,001h,015h,000h +$L$SEH_info_sqr_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redc_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redc_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_from_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_from_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 
000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_n_mul_mont_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqr_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqr_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm new file mode 100644 index 00000000000..83534c629e9 --- /dev/null +++ b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm @@ -0,0 +1,796 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mulx_mont_sparse_256 + + +ALIGN 32 +mulx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mulx_mont_sparse_256:: + + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD 
PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,r14 + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mulx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_sparse_256:: +mulx_mont_sparse_256 ENDP + +PUBLIC sqrx_mont_sparse_256 + + +ALIGN 32 +sqrx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_sparse_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqrx_mont_sparse_256:: + + + mov rbx,rsi + mov r8,rcx + mov rcx,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,rdx + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_sparse_256:: +sqrx_mont_sparse_256 ENDP + +ALIGN 32 +__mulx_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + mulx r12,r15,r15 + mulx r13,rbp,rbp + add r11,r15 + mulx r14,r9,r9 + mov rdx,QWORD PTR[8+rbx] + adc r12,rbp + adc r13,r9 + adc r14,0 + + mov r10,rax + imul rax,r8 + + + xor r15,r15 + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r11,rbp + adcx r12,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r14,rbp + adcx r9,r15 + adox r15,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r10,rbp + adox rax,r11 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r12,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r12,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rbp + adox r14,r9 + adcx r14,r10 + adox r15,r10 + adcx r15,r10 + adox r10,r10 + adc r10,0 + mov r11,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r15,rbp + adcx r9,r10 + adox r10,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r11,rbp + adox rax,r12 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r13,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rbp + adox r15,r9 + adcx r15,r11 + adox r10,r11 + adcx r10,r11 + adox r11,r11 + adc r11,0 + mov r12,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD 
PTR[((16+128))+rsi] + adox r15,rbp + adcx r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r10,rbp + adcx r9,r11 + adox r11,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r12,rbp + adox rax,r13 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,rax + adcx r15,rbp + adox r10,r9 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + adox r12,r12 + adc r12,0 + imul rdx,r8 + + + xor rbp,rbp + mulx r9,r13,QWORD PTR[((0+128))+rcx] + adcx r13,rax + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r15,rbp + adox r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,r14 + lea rcx,QWORD PTR[128+rcx] + adcx r10,rbp + adox r11,r9 + mov rax,r15 + adcx r11,r13 + adox r12,r13 + adc r12,0 + + + + + mov rbp,r10 + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov r9,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r14,rdx + cmovc r15,rax + cmovc r10,rbp + mov QWORD PTR[rdi],r14 + cmovc r11,r9 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulx_mont_sparse_256 ENDP +PUBLIC fromx_mont_256 + + +ALIGN 32 +fromx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + + + + + mov rdx,r15 + mov r12,r10 + mov r13,r11 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + sbb r11,QWORD PTR[24+rbx] + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_256:: +fromx_mont_256 ENDP + +PUBLIC redcx_mont_256 + + +ALIGN 32 +redcx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_256:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + add r14,QWORD PTR[32+rsi] + adc r15,QWORD PTR[40+rsi] + mov rax,r14 + adc r10,QWORD PTR[48+rsi] + mov rdx,r15 + adc r11,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r12,r10 + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + mov r13,r11 + sbb r11,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov 
rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_256:: +redcx_mont_256 ENDP + +ALIGN 32 +__mulx_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + mov rax,QWORD PTR[rsi] + mov r11,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov r14,rax + imul rax,rcx + mov r10,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r10,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + mov r11,r13 + imul r13,rcx + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_sparse_256 + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_end_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_sparse_256 + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_end_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_256 + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_prologue + + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_body + + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_end_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_256 + DD 
imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_prologue + + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_body + + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_end_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_mulx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm new file mode 100644 index 00000000000..25bee97731b --- /dev/null +++ b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm @@ -0,0 +1,3586 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + 
mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__sub_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP +PUBLIC mulx_mont_384x + + +ALIGN 32 +mulx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mulx_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulx_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[((128+48))+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[rbx] + lea rdx,QWORD PTR[((-48))+rbx] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __add_mod_384 + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea 
rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __sub_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __sub_mod_384x384 + + lea rbx,QWORD PTR[rcx] + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384x:: +mulx_mont_384x ENDP +PUBLIC sqrx_mont_384x + + +ALIGN 32 +sqrx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __add_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __sub_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + add rdx,rdx + adc r15,r15 + adc rax,rax + mov r8,rdx + adc r12,r12 + mov r9,r15 + adc rdi,rdi + mov r10,rax + adc rbp,rbp + mov r11,r12 + sbb rsi,rsi + + sub rdx,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r13,rdi + sbb rax,QWORD PTR[16+rcx] + sbb r12,QWORD PTR[24+rcx] + sbb rdi,QWORD PTR[32+rcx] + mov r14,rbp + sbb rbp,QWORD PTR[40+rcx] + sbb rsi,0 + + cmovc rdx,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rbx],rdx + cmovc r12,r11 + mov QWORD PTR[56+rbx],r15 + cmovc rdi,r13 + mov QWORD PTR[64+rbx],rax + cmovc rbp,r14 + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 
epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384x:: +sqrx_mont_384x ENDP + +PUBLIC mulx_382x + + +ALIGN 32 +mulx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mulx_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[((48+128))+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __sub_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_382x:: +mulx_382x ENDP +PUBLIC sqrx_382x + + +ALIGN 32 +sqrx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqrx_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD 
PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_382x:: +sqrx_382x ENDP +PUBLIC mulx_384 + + +ALIGN 32 +mulx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$SEH_body_mulx_384:: + + + mov rbx,rdx + call __mulx_384 + + mov r15,QWORD PTR[rsp] + + mov r14,QWORD PTR[8+rsp] + + mov r13,QWORD PTR[16+rsp] + + mov r12,QWORD PTR[24+rsp] + + mov rbx,QWORD PTR[32+rsp] + + mov rbp,QWORD PTR[40+rsp] + + lea rsp,QWORD PTR[48+rsp] + +$L$SEH_epilogue_mulx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_384:: +mulx_384 ENDP + + +ALIGN 32 +__mulx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rbx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + + mulx rcx,r9,r14 + xor rbp,rbp + + mulx rax,r8,r15 + adcx r8,rcx + mov QWORD PTR[rdi],r9 + + mulx rcx,r9,r10 + adcx r9,rax + + mulx rax,r10,r11 + adcx r10,rcx + + mulx rcx,r11,r12 + adcx r11,rax + + mulx r13,r12,r13 + mov rdx,QWORD PTR[8+rbx] + adcx r12,rcx + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[8+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[16+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx 
rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[24+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[32+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[40+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[40+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,rax + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + mov QWORD PTR[64+rdi],r10 + mov QWORD PTR[72+rdi],r11 + mov QWORD PTR[80+rdi],r12 + mov QWORD PTR[88+rdi],r13 + + DB 0F3h,0C3h ;repret +__mulx_384 ENDP +PUBLIC sqrx_384 + + +ALIGN 32 +sqrx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_384:: + mov rdi,rcx + mov rsi,rdx + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqrx_384:: + + + call __sqrx_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_384:: +sqrx_384 ENDP + +ALIGN 32 +__sqrx_384 PROC PRIVATE + DB 243,15,30,250 + mov rdx,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r15,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rbx,QWORD PTR[32+rsi] + + + mulx rdi,r8,r14 + mov rbp,QWORD PTR[40+rsi] + mulx rax,r9,r15 + add r9,rdi + mulx rdi,r10,rcx + adc r10,rax + mulx rax,r11,rbx + adc r11,rdi + mulx r13,r12,rbp + mov rdx,r14 + adc r12,rax + adc r13,0 + + + xor r14,r14 + mulx rax,rdi,r15 + adcx r10,rdi + adox r11,rax + + mulx rax,rdi,rcx + adcx r11,rdi + adox r12,rax + + mulx rax,rdi,rbx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbp + mov rdx,r15 + adcx r13,rdi + adox rax,r14 + adcx r14,rax + + + xor r15,r15 + mulx rax,rdi,rcx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbx + adcx r13,rdi + adox r14,rax + + mulx rax,rdi,rbp + mov rdx,rcx + adcx r14,rdi + adox rax,r15 + adcx r15,rax + + + xor rcx,rcx + mulx rax,rdi,rbx + adcx r14,rdi + adox r15,rax + + mulx rax,rdi,rbp + mov 
rdx,rbx + adcx r15,rdi + adox rax,rcx + adcx rcx,rax + + + mulx rbx,rdi,rbp + mov rdx,QWORD PTR[rsi] + add rcx,rdi + mov rdi,QWORD PTR[8+rsp] + adc rbx,0 + + + xor rbp,rbp + adcx r8,r8 + adcx r9,r9 + adcx r10,r10 + adcx r11,r11 + adcx r12,r12 + + + mulx rax,rdx,rdx + mov QWORD PTR[rdi],rdx + mov rdx,QWORD PTR[8+rsi] + adox r8,rax + mov QWORD PTR[8+rdi],r8 + + mulx rax,r8,rdx + mov rdx,QWORD PTR[16+rsi] + adox r9,r8 + adox r10,rax + mov QWORD PTR[16+rdi],r9 + mov QWORD PTR[24+rdi],r10 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[24+rsi] + adox r11,r8 + adox r12,r9 + adcx r13,r13 + adcx r14,r14 + mov QWORD PTR[32+rdi],r11 + mov QWORD PTR[40+rdi],r12 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[32+rsi] + adox r13,r8 + adox r14,r9 + adcx r15,r15 + adcx rcx,rcx + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r14 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[40+rsi] + adox r15,r8 + adox rcx,r9 + adcx rbx,rbx + adcx rbp,rbp + mov QWORD PTR[64+rdi],r15 + mov QWORD PTR[72+rdi],rcx + + mulx r9,r8,rdx + adox rbx,r8 + adox rbp,r9 + + mov QWORD PTR[80+rdi],rbx + mov QWORD PTR[88+rdi],rbp + + DB 0F3h,0C3h ;repret +__sqrx_384 ENDP + + + +PUBLIC redcx_mont_384 + + +ALIGN 32 +redcx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_384:: +redcx_mont_384 ENDP + + + + +PUBLIC fromx_mont_384 + + +ALIGN 32 +fromx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + + + + + mov rax,r14 + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_384:: +fromx_mont_384 ENDP + +ALIGN 32 +__mulx_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + mov r8,QWORD PTR[rsi] + mov rdx,rcx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD 
PTR[40+rsi] + imul rdx,r8 + + + xor r14,r14 + mulx rbp,rax,QWORD PTR[rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r13,rax + adox rbp,r14 + adcx r14,rbp + imul rdx,r9 + + + xor r15,r15 + mulx rbp,rax,QWORD PTR[rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r14,rax + adox rbp,r15 + adcx r15,rbp + imul rdx,r10 + + + xor r8,r8 + mulx rbp,rax,QWORD PTR[rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r15,rax + adox rbp,r8 + adcx r8,rbp + imul rdx,r11 + + + xor r9,r9 + mulx rbp,rax,QWORD PTR[rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r8,rax + adox rbp,r9 + adcx r9,rbp + imul rdx,r12 + + + xor r10,r10 + mulx rbp,rax,QWORD PTR[rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r9,rax + adox rbp,r10 + adcx r10,rbp + imul rdx,r13 + + + xor r11,r11 + mulx rbp,rax,QWORD PTR[rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r10,rax + adox rbp,r11 + adcx r11,rbp + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_384 ENDP + + +ALIGN 32 +__redc_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redc_tail_mont_384 ENDP + +PUBLIC 
sgn0x_pty_mont_384 + + +ALIGN 32 +sgn0x_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384:: +sgn0x_pty_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384x + + +ALIGN 32 +sgn0x_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384x:: +sgn0x_pty_mont_384x ENDP +PUBLIC mulx_mont_384 + + +ALIGN 32 +mulx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_mulx_mont_384:: 
+ + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + mov QWORD PTR[rsp],r8 + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mulx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384:: +mulx_mont_384 ENDP + +ALIGN 32 +__mulx_mont_384 PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + xor r15,r15 + + mov QWORD PTR[16+rsp],r8 + imul r8,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx r15,rbp + adox r15,rax + adox rax,rax + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,r8 + adox r15,r8 + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov QWORD PTR[16+rsp],r9 + imul r9,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rax,rbp + adox rax,r8 + adox r8,r8 + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r9 + adox rax,r9 + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov QWORD PTR[16+rsp],r10 + imul r10,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx 
rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx r8,rbp + adox r8,r9 + adox r9,r9 + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r10 + adox r8,r10 + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov QWORD PTR[16+rsp],r11 + imul r11,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx r9,rbp + adox r9,r10 + adox r10,r10 + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r11 + adox r9,r11 + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + mov QWORD PTR[16+rsp],r12 + imul r12,QWORD PTR[8+rsp] + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx r10,rbp + adox r10,r11 + adox r11,r11 + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r12 + adox r10,r12 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + mov r13,r15 + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + mov rsi,rax + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + adcx r9,rdi + adox 
r10,rbp + mov rdx,r14 + adcx r10,r12 + adox r11,r12 + lea rcx,QWORD PTR[128+rcx] + mov r12,r8 + adc r11,0 + + + + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rdi,r9 + sbb rax,QWORD PTR[16+rcx] + sbb r8,QWORD PTR[24+rcx] + sbb r9,QWORD PTR[32+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[40+rcx] + sbb r11,0 + + cmovnc rdx,r14 + cmovc r15,r13 + cmovc rax,rsi + cmovnc r12,r8 + mov QWORD PTR[rbx],rdx + cmovnc rdi,r9 + mov QWORD PTR[8+rbx],r15 + cmovnc rbp,r10 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + + DB 0F3h,0C3h ;repret + +__mulx_mont_384 ENDP +PUBLIC sqrx_mont_384 + + +ALIGN 32 +sqrx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_sqrx_mont_384:: + + + mov r8,rcx + lea rcx,QWORD PTR[((-128))+rdx] + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + lea rbx,QWORD PTR[rsi] + mov QWORD PTR[rsp],r8 + lea rsi,QWORD PTR[((-128))+rsi] + + mulx r9,r8,rdx + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_sqrx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384:: +sqrx_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_384 + + +ALIGN 32 +sqrx_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_384:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_384:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqrx_384:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,rdx + call __mulx_mont_384 + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_384 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_384:: +sqrx_n_mul_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_383 + + +ALIGN 32 +sqrx_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp 
+$L$SEH_begin_sqrx_n_mul_mont_383:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_383:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + lea rcx,QWORD PTR[((-128))+rcx] + +$L$oop_sqrx_383:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + + mulx r9,r8,rdx + call __mulx_mont_383_nonred + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_383 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_383:: +sqrx_n_mul_mont_383 ENDP + +ALIGN 32 +__mulx_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + mov rax,r8 + imul r8,QWORD PTR[8+rsp] + + + xor r15,r15 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx rbp,r15 + adox r15,rbp + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rax,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,rax + adox r15,rax + adcx r15,rax + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rbp,rax + adox rax,rbp + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r8,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + 
adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx rbp,r8 + adox r8,rbp + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r9,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx rbp,r9 + adox r9,rbp + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r10,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx rbp,r10 + adox r10,rbp + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r11,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx 
rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r14 + adcx r9,rdi + adox r10,rbp + adc r10,0 + mov r12,r8 + + mov QWORD PTR[rbx],r14 + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov rdi,r9 + mov QWORD PTR[24+rbx],r8 + mov QWORD PTR[32+rbx],r9 + mov QWORD PTR[40+rbx],r10 + mov rbp,r10 + + DB 0F3h,0C3h ;repret + +__mulx_mont_383_nonred ENDP +PUBLIC sqrx_mont_382x + + +ALIGN 32 +sqrx_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_382x:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + add rdx,rdx + adc r15,r15 + adc rax,rax + adc r12,r12 + adc rdi,rdi + adc rbp,rbp + + mov QWORD PTR[48+rbx],rdx + mov QWORD PTR[56+rbx],r15 + mov QWORD PTR[64+rbx],rax + mov QWORD PTR[72+rbx],r12 + mov QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[((32-128))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + + + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + mov r14,QWORD PTR[((32+96))+rsp] + lea rcx,QWORD PTR[128+rcx] + mov r8,QWORD PTR[((32+0))+rsp] + and r8,r14 + mov r9,QWORD PTR[((32+8))+rsp] + and r9,r14 + mov r10,QWORD PTR[((32+16))+rsp] + and r10,r14 + mov r11,QWORD PTR[((32+24))+rsp] + and r11,r14 + mov r13,QWORD PTR[((32+32))+rsp] + and r13,r14 + and r14,QWORD PTR[((32+40))+rsp] + + sub rdx,r8 + mov r8,QWORD PTR[rcx] + sbb r15,r9 + mov r9,QWORD PTR[8+rcx] + sbb rax,r10 + mov r10,QWORD PTR[16+rcx] + sbb r12,r11 + mov r11,QWORD PTR[24+rcx] + sbb rdi,r13 + mov r13,QWORD PTR[32+rcx] + sbb 
rbp,r14 + sbb r14,r14 + + and r8,r14 + and r9,r14 + and r10,r14 + and r11,r14 + and r13,r14 + and r14,QWORD PTR[40+rcx] + + add rdx,r8 + adc r15,r9 + adc rax,r10 + adc r12,r11 + adc rdi,r13 + adc rbp,r14 + + mov QWORD PTR[rbx],rdx + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_382x:: +sqrx_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_384x + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_prologue + + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_end_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384x + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_end_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_382x + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_prologue + + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_body + + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_end_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_epilogue + + DD imagerel $L$SEH_begin_sqrx_382x + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_prologue + + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_end_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_epilogue + + DD imagerel $L$SEH_begin_mulx_384 + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_info_mulx_384_prologue + + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_info_mulx_384_body + + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_end_mulx_384 + DD imagerel $L$SEH_info_mulx_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_384 + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_prologue + + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_body + + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_end_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_384 + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_prologue + + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_body + + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_end_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_384 + DD imagerel 
$L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_prologue + + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_body + + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_end_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384 + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_end_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384x + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_end_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_mont_384 + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_prologue + + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_end_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384 + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_end_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_382x + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_end_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384x_body:: +DB 1,0,18,0 
+DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +$L$SEH_info_mulx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_mulx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,000h,000h +DB 000h,0e4h,001h,000h +DB 000h,0d4h,002h,000h +DB 000h,0c4h,003h,000h +DB 000h,034h,004h,000h +DB 000h,054h,005h,000h +DB 000h,074h,007h,000h +DB 000h,064h,008h,000h +DB 000h,052h +DB 000h,000h +$L$SEH_info_mulx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sqrx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_redcx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_redcx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_fromx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 
000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_fromx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sgn0x_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h +$L$SEH_info_sgn0x_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_mulx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_mulx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h +$L$SEH_info_sqrx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_n_mul_mont_383_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h +$L$SEH_info_sqrx_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_sqrx_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 
000h,064h,019h,000h +DB 000h,001h,017h,000h +$L$SEH_info_sqrx_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/sha256-armv8.asm b/crypto/blst_src/build/win64/sha256-armv8.asm new file mode 100644 index 00000000000..0e0c54cb65b --- /dev/null +++ b/crypto/blst_src/build/win64/sha256-armv8.asm @@ -0,0 +1,1078 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + + AREA |.text|,CODE,ALIGN=8,ARM64 + + ALIGN 64 + +|$LK256| + DCDU 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DCDU 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DCDU 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DCDU 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DCDU 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DCDU 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DCDU 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DCDU 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DCDU 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DCDU 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DCDU 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DCDU 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DCDU 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DCDU 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DCDU 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DCDU 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DCDU 0 //terminator + + DCB "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm",0 + ALIGN 4 + ALIGN 4 + + EXPORT |blst_sha256_block_armv8|[FUNC] + ALIGN 64 +|blst_sha256_block_armv8| PROC +|$Lv8_entry| + stp x29,x30,[sp,#-16]!
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,|$LK256| + +|$Loop_hw| + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,|$Loop_hw| + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + ENDP + + EXPORT |blst_sha256_block_data_order|[FUNC] + ALIGN 16 +|blst_sha256_block_data_order| PROC + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,|$LK256| + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b |$L_00_48| + + ALIGN 16 +|$L_00_48| + ext8 v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor 
v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr 
v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp 
w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne |$L_00_48| + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + cseleq x17,x17,xzr + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + 
add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + bne |$L_00_48| + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + ENDP + + + EXPORT |blst_sha256_emit|[FUNC] + ALIGN 16 +|blst_sha256_emit| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] 
+ str w6,[x0,#16] + str w7,[x0,#24] + ret + ENDP + + + + EXPORT |blst_sha256_bcopy|[FUNC] + ALIGN 16 +|blst_sha256_bcopy| PROC +|$Loop_bcopy| + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,|$Loop_bcopy| + ret + ENDP + + + + EXPORT |blst_sha256_hcopy|[FUNC] + ALIGN 16 +|blst_sha256_hcopy| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/sha256-x86_64.asm b/crypto/blst_src/build/win64/sha256-x86_64.asm new file mode 100644 index 00000000000..d3b409235e7 --- /dev/null +++ b/crypto/blst_src/build/win64/sha256-x86_64.asm @@ -0,0 +1,1570 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +ALIGN 64 + +K256:: + DD 0428a2f98h,071374491h,0b5c0fbcfh,0e9b5dba5h + DD 03956c25bh,059f111f1h,0923f82a4h,0ab1c5ed5h + DD 0d807aa98h,012835b01h,0243185beh,0550c7dc3h + DD 072be5d74h,080deb1feh,09bdc06a7h,0c19bf174h + DD 0e49b69c1h,0efbe4786h,00fc19dc6h,0240ca1cch + DD 02de92c6fh,04a7484aah,05cb0a9dch,076f988dah + DD 0983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h + DD 0c6e00bf3h,0d5a79147h,006ca6351h,014292967h + DD 027b70a85h,02e1b2138h,04d2c6dfch,053380d13h + DD 0650a7354h,0766a0abbh,081c2c92eh,092722c85h + DD 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h + DD 0d192e819h,0d6990624h,0f40e3585h,0106aa070h + DD 019a4c116h,01e376c08h,02748774ch,034b0bcb5h + DD 0391c0cb3h,04ed8aa4ah,05b9cca4fh,0682e6ff3h + DD 0748f82eeh,078a5636fh,084c87814h,08cc70208h + DD 090befffah,0a4506cebh,0bef9a3f7h,0c67178f2h + + DD 000010203h,004050607h,008090a0bh,00c0d0e0fh + DD 003020100h,00b0a0908h,0ffffffffh,0ffffffffh + DD 0ffffffffh,0ffffffffh,003020100h,00b0a0908h +DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 +DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +DB 32,64,100,111,116,45,97,115,109,0 +PUBLIC blst_sha256_block_data_order_shaext + + +ALIGN 64 +blst_sha256_block_data_order_shaext PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order_shaext:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + sub rsp,058h + + movaps XMMWORD PTR[(-88)+r11],xmm6 + + movaps XMMWORD PTR[(-72)+r11],xmm7 + + movaps XMMWORD PTR[(-56)+r11],xmm8 + + movaps XMMWORD PTR[(-40)+r11],xmm9 + + movaps XMMWORD PTR[(-24)+r11],xmm10 + +$L$SEH_body_blst_sha256_block_data_order_shaext:: + + lea rcx,QWORD PTR[((K256+128))] + movdqu xmm1,XMMWORD PTR[rdi] + movdqu xmm2,XMMWORD PTR[16+rdi] + movdqa xmm7,XMMWORD PTR[((256-128))+rcx] + + pshufd xmm0,xmm1,01bh + pshufd xmm1,xmm1,0b1h + pshufd xmm2,xmm2,01bh + movdqa xmm8,xmm7 +DB 102,15,58,15,202,8 + punpcklqdq xmm2,xmm0 + jmp $L$oop_shaext + +ALIGN 16 +$L$oop_shaext:: + movdqu xmm3,XMMWORD PTR[rsi] + movdqu xmm4,XMMWORD PTR[16+rsi] + movdqu xmm5,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqu xmm6,XMMWORD PTR[48+rsi] + + movdqa xmm0,XMMWORD PTR[((0-128))+rcx] + paddd xmm0,xmm3 +DB 102,15,56,0,231 + movdqa xmm10,xmm2 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + nop + movdqa xmm9,xmm1 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((16-128))+rcx] + paddd xmm0,xmm4 +DB 102,15,56,0,239 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + lea rsi,QWORD PTR[64+rsi] +DB 15,56,204,220 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((32-128))+rcx] + paddd xmm0,xmm5 +DB 102,15,56,0,247 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD 
PTR[((48-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((64-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((80-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((96-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((112-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((128-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((144-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((160-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((176-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((192-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((208-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 +DB 15,56,203,202 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD PTR[((224-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh +DB 15,56,205,245 + movdqa xmm7,xmm8 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((240-128))+rcx] + paddd xmm0,xmm6 + nop +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + dec rdx + nop +DB 15,56,203,202 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz $L$oop_shaext + + pshufd xmm2,xmm2,0b1h + pshufd xmm7,xmm1,01bh + pshufd xmm1,xmm1,0b1h + punpckhqdq xmm1,xmm2 +DB 102,15,58,15,215,8 + + movdqu XMMWORD PTR[rdi],xmm1 + movdqu XMMWORD PTR[16+rdi],xmm2 + movaps xmm6,XMMWORD PTR[((-88))+r11] + movaps xmm7,XMMWORD PTR[((-72))+r11] + movaps xmm8,XMMWORD PTR[((-56))+r11] + movaps xmm9,XMMWORD PTR[((-40))+r11] + movaps xmm10,XMMWORD PTR[((-24))+r11] + mov rsp,r11 + +$L$SEH_epilogue_blst_sha256_block_data_order_shaext:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order_shaext:: +blst_sha256_block_data_order_shaext ENDP +PUBLIC blst_sha256_block_data_order + + +ALIGN 64 +blst_sha256_block_data_order PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi 
;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order:: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,104 + + lea rdx,QWORD PTR[rdx*4+rsi] + mov QWORD PTR[rsp],rdi + + mov QWORD PTR[16+rsp],rdx + movaps XMMWORD PTR[32+rsp],xmm6 + + movaps XMMWORD PTR[48+rsp],xmm7 + + movaps XMMWORD PTR[64+rsp],xmm8 + + movaps XMMWORD PTR[80+rsp],xmm9 + + mov rbp,rsp + +$L$SEH_body_blst_sha256_block_data_order:: + + + lea rsp,QWORD PTR[((-64))+rsp] + mov eax,DWORD PTR[rdi] + and rsp,-64 + mov ebx,DWORD PTR[4+rdi] + mov ecx,DWORD PTR[8+rdi] + mov edx,DWORD PTR[12+rdi] + mov r8d,DWORD PTR[16+rdi] + mov r9d,DWORD PTR[20+rdi] + mov r10d,DWORD PTR[24+rdi] + mov r11d,DWORD PTR[28+rdi] + + + jmp $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3:: + movdqa xmm7,XMMWORD PTR[((K256+256))] + mov QWORD PTR[8+rbp],rsi + movdqu xmm0,XMMWORD PTR[rsi] + movdqu xmm1,XMMWORD PTR[16+rsi] + movdqu xmm2,XMMWORD PTR[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD PTR[48+rsi] + lea rsi,QWORD PTR[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD PTR[rsi] + movdqa xmm5,XMMWORD PTR[16+rsi] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD PTR[48+rsi] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD PTR[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD PTR[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD PTR[48+rsp],xmm7 + mov r13d,r8d + jmp $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47:: + sub rsi,-64 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor 
r13d,ebx + add r8d,DWORD PTR[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[16+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add 
r10d,DWORD PTR[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[32+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[52+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add 
eax,r12d + movdqa xmm6,XMMWORD PTR[48+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[48+rsp],xmm6 + cmp BYTE PTR[67+rsi],0 + jne $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor 
r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD PTR[rbp] + mov eax,r14d + mov rsi,QWORD PTR[8+rbp] + + add eax,DWORD PTR[rdi] + add ebx,DWORD PTR[4+rdi] + add ecx,DWORD PTR[8+rdi] + add edx,DWORD PTR[12+rdi] + add r8d,DWORD PTR[16+rdi] + add r9d,DWORD PTR[20+rdi] + add r10d,DWORD PTR[24+rdi] + add r11d,DWORD PTR[28+rdi] + + lea rsi,QWORD PTR[64+rsi] + cmp rsi,QWORD PTR[16+rbp] + + mov DWORD PTR[rdi],eax + mov DWORD PTR[4+rdi],ebx + mov DWORD PTR[8+rdi],ecx + mov DWORD PTR[12+rdi],edx + mov DWORD 
PTR[16+rdi],r8d + mov DWORD PTR[20+rdi],r9d + mov DWORD PTR[24+rdi],r10d + mov DWORD PTR[28+rdi],r11d + jb $L$loop_ssse3 + + xorps xmm0,xmm0 + lea r11,QWORD PTR[((104+48))+rbp] + + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps xmm6,XMMWORD PTR[32+rbp] + movaps xmm7,XMMWORD PTR[48+rbp] + movaps xmm8,XMMWORD PTR[64+rbp] + movaps xmm9,XMMWORD PTR[80+rbp] + mov r15,QWORD PTR[104+rbp] + + mov r14,QWORD PTR[((-40))+r11] + + mov r13,QWORD PTR[((-32))+r11] + + mov r12,QWORD PTR[((-24))+r11] + + mov rbx,QWORD PTR[((-16))+r11] + + mov rbp,QWORD PTR[((-8))+r11] + +$L$SEH_epilogue_blst_sha256_block_data_order:: + mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue + mov rsi,QWORD PTR[16+r11] + + lea rsp,QWORD PTR[r11] + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order:: +blst_sha256_block_data_order ENDP +PUBLIC blst_sha256_emit + + +ALIGN 16 +blst_sha256_emit PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + bswap r8 + mov r11,QWORD PTR[24+rdx] + bswap r9 + mov DWORD PTR[4+rcx],r8d + bswap r10 + mov DWORD PTR[12+rcx],r9d + bswap r11 + mov DWORD PTR[20+rcx],r10d + shr r8,32 + mov DWORD PTR[28+rcx],r11d + shr r9,32 + mov DWORD PTR[rcx],r8d + shr r10,32 + mov DWORD PTR[8+rcx],r9d + shr r11,32 + mov DWORD PTR[16+rcx],r10d + mov DWORD PTR[24+rcx],r11d + DB 0F3h,0C3h ;repret +blst_sha256_emit ENDP + +PUBLIC blst_sha256_bcopy + + +ALIGN 16 +blst_sha256_bcopy PROC PUBLIC + DB 243,15,30,250 + sub rcx,rdx +$L$oop_bcopy:: + movzx eax,BYTE PTR[rdx] + lea rdx,QWORD PTR[1+rdx] + mov BYTE PTR[((-1))+rdx*1+rcx],al + dec r8 + jnz $L$oop_bcopy + DB 0F3h,0C3h ;repret +blst_sha256_bcopy ENDP + +PUBLIC blst_sha256_hcopy + + +ALIGN 16 +blst_sha256_hcopy PROC PUBLIC + DB 243,15,30,250 + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[rcx],r8 + mov QWORD PTR[8+rcx],r9 + mov QWORD PTR[16+rcx],r10 + mov QWORD PTR[24+rcx],r11 + DB 0F3h,0C3h ;repret +blst_sha256_hcopy ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_end_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue + + DD imagerel $L$SEH_begin_blst_sha256_block_data_order + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_end_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_blst_sha256_block_data_order_shaext_body:: +DB 1,0,15,0 +DB 000h,068h,000h,000h 
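As a reading aid for the MASM above: blst_sha256_emit appears to write the eight 32-bit SHA-256 state words out in big-endian byte order (that is what the bswap/shr pairs accomplish), blst_sha256_bcopy is a byte-wise copy and blst_sha256_hcopy copies the eight state words verbatim. A portable sketch of the emit step, with a hypothetical name and under the assumption that this reading of the assembly is right:

#include <stddef.h>
#include <stdint.h>

/* Write the eight 32-bit SHA-256 state words as 32 big-endian bytes;
 * portable counterpart of what the bswap-based assembly above does. */
static void sha256_emit_be(unsigned char md[32], const uint32_t h[8])
{
    size_t i;
    for (i = 0; i < 8; i++) {
        md[4*i + 0] = (unsigned char)(h[i] >> 24);
        md[4*i + 1] = (unsigned char)(h[i] >> 16);
        md[4*i + 2] = (unsigned char)(h[i] >>  8);
        md[4*i + 3] = (unsigned char)(h[i]);
    }
}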
+DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0a8h,004h,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + +$L$SEH_info_blst_sha256_block_data_order_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,003h +DB 0,0 +$L$SEH_info_blst_sha256_block_data_order_body:: +DB 1,0,26,5 +DB 000h,068h,002h,000h +DB 000h,078h,003h,000h +DB 000h,088h,004h,000h +DB 000h,098h,005h,000h +DB 000h,0f4h,00dh,000h +DB 000h,0e4h,00eh,000h +DB 000h,0d4h,00fh,000h +DB 000h,0c4h,010h,000h +DB 000h,034h,011h,000h +DB 000h,074h,014h,000h +DB 000h,064h,015h,000h +DB 000h,003h +DB 000h,001h,012h,000h +DB 000h,050h +$L$SEH_info_blst_sha256_block_data_order_epilogue:: +DB 1,0,5,11 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,003h +DB 000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/bulk_addition.c b/crypto/blst_src/bulk_addition.c new file mode 100644 index 00000000000..81afc530665 --- /dev/null +++ b/crypto/blst_src/bulk_addition.c @@ -0,0 +1,168 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * This implementation uses explicit addition formula: + * + * λ = (Y₂-Y₁)/(X₂-X₁) + * X₃ = λ²-(X₁+X₂) + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * But since we don't know if we'll have to add point to itself, we need + * to eventually resort to corresponding doubling formula: + * + * λ = 3X₁²/2Y₁ + * X₃ = λ²-2X₁ + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * The formulae use prohibitively expensive inversion, but whenever we + * have a lot of affine points to accumulate, we can amortize the cost + * by applying Montgomery's batch inversion approach. As a result, + * asymptotic[!] per-point cost for addition is as small as 5M+1S. For + * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things + * considered, the improvement coefficient varies from 60% to 85% + * depending on platform and curve. + * + * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an + * application that requires constant time-ness, speak up!] + */ + +/* + * Calculate λ's numerator and denominator. 
+ * + * input: A x1 y1 - + * B x2 y2 - + * output: + * if A!=B: A x1 y1 (x2-x1)*mul_acc + * B x2+x1 y2-y1 (x2-x1) + * + * if A==B: A x y 2y*mul_acc + * B 2x 3*x^2 2y + * + * if A==-B: A 0 0 1*mul_acc + * B 0 3*x^2 0 + */ +#define HEAD(ptype, bits, field, one) \ +static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \ +{ \ + ptype *A = AB, *B = AB+1; \ + limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \ + vec_is_zero(B, sizeof(ptype##_affine)); \ + static const vec##bits zero = { 0 }; \ +\ + sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \ + add_##field(B->X, B->X, A->X); /* X2+X1 */ \ + add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \ + sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \ + if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \ + inf = vec_is_zero(A->Z, sizeof(A->Z)); \ + vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \ + sqr_##field(B->Y, A->X); \ + mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \ + vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \ + } /* B->Y is numenator */ \ + /* B->Z is denominator */ \ + vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \ + vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \ + vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \ + vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \ + if (mul_acc != NULL) \ + mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\ +} + +/* + * Calculate λ and resulting coordinates. + * + * input: A x1 y1 - + * B x2+x1 nominator - + * lambda 1/denominator + * output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1 + */ +#define TAIL(ptype, bits, field, one) \ +static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \ +{ \ + ptype *A = AB, *B = AB+1; \ + vec##bits llambda; \ + limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \ +\ + mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \ + /* alt. 3*X1^2/2*Y1 */ \ + sqr_##field(llambda, lambda); \ + sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \ +\ + sub_##field(D->Y, A->X, D->X); \ + mul_##field(D->Y, D->Y, lambda); \ + sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \ +\ + vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \ + vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \ +} + +/* + * |points[]| is volatile buffer with |X|s and |Y|s initially holding + * input affine coordinates, and with |Z|s being used as additional + * temporary storage [unrelated to Jacobian coordinates]. |sum| is + * in-/output, initialize to infinity accordingly. 
+ */ +#define ADDITION_BTREE(prefix, ptype, bits, field, one) \ +HEAD(ptype, bits, field, one) \ +TAIL(ptype, bits, field, one) \ +static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ +{ \ + ptype *dst; \ + void *mul_acc; \ + size_t i; \ +\ + while (n >= 16) { \ + if (n & 1) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ + n /= 2; \ + for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ + ptype##_head(points, mul_acc); \ +\ + reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ +\ + for (dst = points, i = n; --i;) { \ + dst--; points -= 2; \ + mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ + ptype##_tail(dst, points, points[-2].Z); \ + mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ + } \ + dst--; points -= 2; \ + ptype##_tail(dst, points, points[0].Z); \ + points = dst; \ + } \ + while (n--) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ +} \ +\ +void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + /* Performance with 288K scratch is within 1-2-3% from optimal */ \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \ + ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ + sizeof(ptype)); \ + const ptype##_affine *point = NULL; \ +\ + vec_zero(sum, sizeof(*sum)); \ + while (npoints) { \ + size_t i, j = npoints > stride ? stride : npoints; \ + for (i=0; i> (8 * (n % sizeof(limb_t)))); + } +} + +static inline void limbs_from_le_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= in[n]; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restict'-ed storage... 
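The ptype##s_accumulate tree above pays for a single reciprocal_##field call per batch and recovers every individual 1/Z by multiplying prefix products back out, which is Montgomery's batch-inversion trick and the source of the ~5M+1S amortized per-point cost quoted earlier. A standalone sketch of that trick over a toy prime field (the fe_* names are hypothetical, chosen only to keep the example self-contained):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Toy prime field standing in for the 381-bit fp; the point is the
 * batching pattern, not the field arithmetic. */
#define TOY_P 1000003ULL

static uint64_t fe_mul(uint64_t a, uint64_t b) { return (a * b) % TOY_P; }

static uint64_t fe_inv(uint64_t a)         /* the one expensive inversion */
{
    uint64_t r = 1, e = TOY_P - 2;         /* Fermat: a^(P-2) mod P */
    while (e) {
        if (e & 1) r = fe_mul(r, a);
        a = fe_mul(a, a);
        e >>= 1;
    }
    return r;
}

/* Invert n (>0) nonzero elements in place with a single fe_inv():
 * accumulate prefix products, invert the grand total once, then peel the
 * factors off backwards.  This is the same amortization the accumulator
 * above performs on the Z denominators. */
static void batch_invert(uint64_t v[], uint64_t prefix[], size_t n)
{
    uint64_t acc;
    size_t i;

    prefix[0] = v[0];
    for (i = 1; i < n; i++)
        prefix[i] = fe_mul(prefix[i-1], v[i]);

    acc = fe_inv(prefix[n-1]);             /* 1 / (v[0]*...*v[n-1]) */

    for (i = n; --i > 0;) {
        uint64_t inv_i = fe_mul(acc, prefix[i-1]);  /* = 1/v[i]             */
        acc = fe_mul(acc, v[i]);                    /* drop v[i] from chain */
        v[i] = inv_i;
    }
    v[0] = acc;
}

int main(void)
{
    uint64_t v[4] = { 2, 3, 5, 7 }, prefix[4];
    batch_invert(v, prefix, 4);
    printf("%llu\n", (unsigned long long)fe_mul(v[0], 2));  /* prints 1 */
    return 0;
}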
+ */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + limb_t limb; + size_t i, j, r; + + if ((uptr_t)out == (uptr_t)in && is_endian.little) + return; + + r = n % sizeof(limb_t); + n /= sizeof(limb_t); + + for(i = 0; i < n; i++) { + for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8) + *out++ = (unsigned char)limb; + } + if (r) { + for (limb = in[i], j = 0; j < r; j++, limb >>= 8) + *out++ = (unsigned char)limb; + } +} + +static inline char hex_from_nibble(unsigned char nibble) +{ + int mask = (9 - (nibble &= 0xf)) >> 31; + return (char)(nibble + ((('a'-10) & mask) | ('0' & ~mask))); +} + +static unsigned char nibble_from_hex(char c) +{ + int mask, ret; + + mask = (('a'-c-1) & (c-1-'f')) >> 31; + ret = (10 + c - 'a') & mask; + mask = (('A'-c-1) & (c-1-'F')) >> 31; + ret |= (10 + c - 'A') & mask; + mask = (('0'-c-1) & (c-1-'9')) >> 31; + ret |= (c - '0') & mask; + mask = ((ret-1) & ~mask) >> 31; + ret |= 16 & mask; + + return (unsigned char)ret; +} + +static void bytes_from_hexascii(unsigned char *ret, size_t sz, const char *hex) +{ + size_t len; + unsigned char b = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + bytes_zero(ret, sz); + + while(len--) { + b <<= 4; + b |= nibble_from_hex(*hex++); + if (len % 2 == 0) + ret[len / 2] = b; + } +} + +static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) +{ + size_t len; + limb_t limb = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + vec_zero(ret, sz); + + while(len--) { + limb <<= 4; + limb |= nibble_from_hex(*hex++); + if (len % (2*sizeof(limb_t)) == 0) + ret[len / (2*sizeof(limb_t))] = limb; + } +} + +#endif diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c new file mode 100644 index 00000000000..0fcf563f502 --- /dev/null +++ b/crypto/blst_src/client_min_pk.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c new file mode 100644 index 00000000000..8e4663daede --- /dev/null +++ b/crypto/blst_src/client_min_sig.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/consts.c b/crypto/blst_src/consts.c new file mode 100644 index 00000000000..021c878a258 --- /dev/null +++ b/crypto/blst_src/consts.c @@ -0,0 +1,36 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" + +/* z = -0xd201000000010000 */ +const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ + TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), + TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), + TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) +}; +const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ + +const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ + { { ONE_MONT_P }, + { 0 } } +}; + +const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), + TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), + TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) +}; + +const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ + TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), + TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) +}; + +const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), + TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) +}; diff --git a/crypto/blst_src/consts.h b/crypto/blst_src/consts.h new file mode 100644 index 00000000000..cb391b817df --- /dev/null +++ b/crypto/blst_src/consts.h @@ -0,0 +1,30 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_CONST_H__ +#define __BLS12_381_ASM_CONST_H__ +#include "vect.h" + +extern const vec384 BLS12_381_P; +extern const limb_t BLS12_381_p0; +static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ +typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384; +extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ +extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ + +#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ + TO_LIMB_T(0xebf4000bc40c0002), \ + TO_LIMB_T(0x5f48985753c758ba), \ + TO_LIMB_T(0x77ce585370525745), \ + TO_LIMB_T(0x5c071a97a256ec6d), \ + TO_LIMB_T(0x15f65ec3fa80e493) + +#define ZERO_384 (BLS12_381_Rx.p2[1]) + +extern const vec256 BLS12_381_r; /* order */ +static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ +extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ + +#endif diff --git a/crypto/blst_src/e1.c b/crypto/blst_src/e1.c new file mode 100644 index 00000000000..91c4cdbf39c --- /dev/null +++ b/crypto/blst_src/e1.c @@ -0,0 +1,564 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
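For orientation, the standard Montgomery-arithmetic reading of the constants just defined (the comments' "radix" is R below); this is background rather than anything the patch states beyond its comments, and the limb width is assumed to be 64 bits:

\[
R = 2^{384}, \qquad \mathrm{MontMul}(a,b) = a\,b\,R^{-1} \bmod P,
\]
\[
\mathrm{MontMul}(a,\; R^{2} \bmod P) = a\,R \bmod P \quad\text{(into Montgomery form, via BLS12\_381\_RR)},
\]
\[
\mathrm{MontMul}(\tilde a,\; 1) = \tilde a\,R^{-1} \bmod P \quad\text{(back to canonical form)},
\]
\[
P\cdot\texttt{BLS12\_381\_p0} \equiv -1 \pmod{2^{64}},
\]

so BLS12_381_Rx holds R mod P (the value 1 in Montgomery form) and BLS12_381_rRR plays the same to-Montgomery role modulo the group order r.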
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384 B_E1 = { /* (4 << 384) % P */ + TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) +}; + +const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 + * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ + { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), + TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), + TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, + { ONE_MONT_P } +}; + +const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 + * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ + { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), + TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), + TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, + { ONE_MONT_P } +}; + +static inline void mul_by_b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 2); } + +static inline void mul_by_4b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 4); } + +static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit) +{ cneg_fp(p->Y, p->Y, cbit); } + +void blst_p1_cneg(POINTonE1 *a, int cbit) +{ POINTonE1_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) +{ + vec384 Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp(Z, in->Z); /* 1/Z */ + + sqr_fp(ZZ, Z); + mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G1.Z, + sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 
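For orientation, the Jacobian-coordinate convention used by POINTonE1_from_Jacobian above and by the on-curve tests below: a triple (X : Y : Z) with Z nonzero stands for the affine point (X/Z², Y/Z³), and Z = 0 encodes the point at infinity, so clearing denominators in y² = x³ + b gives the projective test

\[
(X:Y:Z)\ \longmapsto\ \Bigl(\frac{X}{Z^{2}},\ \frac{Y}{Z^{3}}\Bigr),
\qquad
Y^{2} = X^{3} + b\,Z^{6},
\]

with b = 4 kept in Montgomery form as B_E1 (and b = 4 + 4i as B_E2 on the twist).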
0 : 1 */ +} + +void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_from_Jacobian(out, a); } + +static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) +{ POINTonE1_to_affine(out, a); } + +void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) +{ + vec384 XXX, YY; + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, B_E1); /* X^3 + B */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p1_affine_on_curve(const POINTonE1_affine *p) +{ return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE1_on_curve(const POINTonE1 *p) +{ + vec384 XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp(BZ6, p->Z); + mul_fp(BZ6, BZ6, p->Z); + sqr_fp(BZ6, BZ6); /* Z^6 */ + mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p1_on_curve(const POINTonE1 *p) +{ return (int)POINTonE1_on_curve(p); } + +static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + from_fp(temp, in->Y); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mod_384(temp, BLS12_381_P); +} + +void blst_p1_affine_serialize(unsigned char out[96], + const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE1_Serialize_BE(unsigned char out[96], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); +} + +static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_Serialize_BE(out, in); + } +} + +void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) +{ POINTonE1_Serialize(out, in); } + +static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); +} + +void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Compress_BE(unsigned char out[48], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, 
sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); +} + +void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, + const unsigned char in[48]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + mul_fp(ret.X, ret.X, BLS12_381_RR); + + sqr_fp(ret.Y, ret.X); + mul_fp(ret.Y, ret.Y, ret.X); + add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ + if (!sqrt_fp(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, + const unsigned char in[48]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE1_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp(out->Y, out->Y, sgn0_pty); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) +{ return POINTonE1_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, + const unsigned char in[96]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y, sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X, ret.X, BLS12_381_RR); + mul_fp(ret.Y, ret.Y, BLS12_381_RR); + + if (!POINTonE1_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? 
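A usage sketch for the decoder just defined, under the assumption that the library's public header blst.h (not part of this hunk) exposes it as usual through the blst_p1_affine alias and also provides blst_p1_affine_in_g1. Uncompression validates the encoding and the curve equation, but subgroup membership is a separate concern, as the POINT_NOT_IN_GROUP remark above hints:

#include "blst.h"   /* public API header, provided elsewhere in the library */

/* Illustration only: decode a 48-byte compressed G1 point (e.g. a BLS
 * public key) and additionally check membership in the r-order subgroup. */
int load_g1_pubkey(blst_p1_affine *pk, const unsigned char in[48])
{
    if (blst_p1_uncompress(pk, in) != BLST_SUCCESS)
        return 0;                            /* bad encoding or off curve */
    return (int)blst_p1_affine_in_g1(pk);    /* separate subgroup check   */
}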
BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE1_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE1_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, + const unsigned char in[96]) +{ return POINTonE1_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE1, 384, fp) +POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_ADD_IMPL(POINTonE1, 384, fp) +POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) +POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) + +void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add(out, a, b); } + +void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd(out, a, b, NULL); } + +void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_add_affine(out, a, b); } + +void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_dadd_affine(out, a, b); } + +void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_double(out, a); } + +int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) +{ return (int)POINTonE1_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) + +DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) +POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) +POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) +#endif + +static const vec384 beta = { /* such that beta^3 - 1 = 0 */ + /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ + /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ + TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) +}; + +static void sigma(POINTonE1 *out, const POINTonE1 *in) +{ + vec_copy(out->X, in->X, 2*sizeof(out->X)); + mul_fp(out->Z, in->Z, beta); +} + +/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ +static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* SK/z^2 [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s+16, val.s }; + POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ + size_t i; + + POINTonE1_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + mul_fp(table[1][i].X, table[0][i].X, beta); + cneg_fp(table[1][i].Y, table[0][i].Y, 1); + vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); + } + + POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); + POINTonE1_cneg(out, 1); + mul_fp(out->Z, out->Z, beta); + mul_fp(out->Z, out->Z, beta); + } + + vec_zero(val.l, sizeof(val)); /* 
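Background on the GLV trick that the "~45% faster" remark refers to (standard theory, not stated by the patch beyond its comments): beta is a nontrivial cube root of unity in F_P, so sigma(x, y) = (beta*x, y) is an endomorphism of the curve that acts on the order-r subgroup as multiplication by an eigenvalue lambda satisfying lambda² + lambda + 1 ≡ 0 (mod r). The div_by_zz step splits the secret scalar into two roughly 128-bit halves with respect to that eigenvalue, and the two half-length multiplications then share one doubling chain:

\[
\sigma(x,y) = (\beta x,\ y) = [\lambda](x,y),\qquad
\beta^{3}\equiv 1 \pmod{P},\qquad
\lambda^{2}+\lambda+1\equiv 0 \pmod{r},
\]
\[
[k]P = [k_{0}]P + [k_{1}]\,\sigma(P),\qquad
k \equiv k_{0}+k_{1}\lambda \pmod{r},\qquad
|k_{0}|,|k_{1}| \approx \sqrt{r}.
\]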
scrub the copy of SK */ +} + +static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) +{ + vec384 Z, ZZ; + limb_t inf; + + POINTonE1_mult_glv(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp(ZZ, Z); + mul_fp(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), + inf); /* Z = inf ? 0 : 1 */ +} + +void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK) +{ POINTonE1_sign(out, &BLS12_381_G1, SK); } + +void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK) +{ POINTonE1_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, + const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, &BLS12_381_G1, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, + const POINTonE1 *hash, const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 176) { + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE1_mult_glv(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p1_affine_is_equal(const POINTonE1_affine *a, + const POINTonE1_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p1_is_inf(const POINTonE1 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE1 *blst_p1_generator(void) +{ return &BLS12_381_G1; } + +int blst_p1_affine_is_inf(const POINTonE1_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE1_affine *blst_p1_affine_generator(void) +{ return (const POINTonE1_affine *)&BLS12_381_G1; } + +size_t blst_p1_sizeof(void) +{ return sizeof(POINTonE1); } + +size_t blst_p1_affine_sizeof(void) +{ return sizeof(POINTonE1_affine); } diff --git a/crypto/blst_src/e2.c b/crypto/blst_src/e2.c new file mode 100644 index 00000000000..822ac23c694 --- /dev/null +++ b/crypto/blst_src/e2.c @@ -0,0 
+1,638 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384x B_E2 = { /* 4 + 4*i */ + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } +}; + +const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 + 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ + { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), + TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), + TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, + /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af + 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ + { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), + TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), + TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, +}, +{ { ONE_MONT_P }, { 0 } } +}; + +const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 + f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ + { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), + TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), + TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, + /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 + 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ + { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), + TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), + TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } +}, +{ { ONE_MONT_P }, { 0 } } +}; + +static void mul_by_b_onE2(vec384x out, const vec384x in) +{ + 
sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 2); + lshift_fp(out[1], out[1], 2); +} + +static void mul_by_4b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 4); + lshift_fp(out[1], out[1], 4); +} + +static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) +{ cneg_fp2(p->Y, p->Y, cbit); } + +void blst_p2_cneg(POINTonE2 *a, int cbit) +{ POINTonE2_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp2(Z, in->Z); /* 1/Z */ + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G2.Z, + sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ +} + +void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_from_Jacobian(out, a); } + +static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) +{ POINTonE2_to_affine(out, a); } + +void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) +{ + vec384x XXX, YY; + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, B_E2); /* X^3 + B */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p2_affine_on_curve(const POINTonE2_affine *p) +{ return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE2_on_curve(const POINTonE2 *p) +{ + vec384x XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp2(BZ6, p->Z); + mul_fp2(BZ6, BZ6, p->Z); + sqr_fp2(XXX, BZ6); /* Z^6 */ + mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p2_on_curve(const POINTonE2 *p) +{ return (int)POINTonE2_on_curve(p); } + +static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], + const POINTonE2_affine *in) +{ + vec384x temp; + + from_fp(temp[1], in->X[1]); + be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->X[0]); + be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); + + from_fp(temp[1], in->Y[1]); + be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->Y[0]); + be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); + + return sgn0_pty_mod_384x(temp, BLS12_381_P); +} + +void blst_p2_affine_serialize(unsigned char out[192], + const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE2_Serialize_BE(unsigned char out[192], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return 
POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); +} + +static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_Serialize_BE(out, in); + } +} + +void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) +{ POINTonE2_Serialize(out, in); } + +static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], + const POINTonE2_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X[1]); + be_bytes_from_limbs(out, temp, sizeof(temp)); + from_fp(temp, in->X[0]); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); +} + +void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE2_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Compress_BE(unsigned char out[96], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); +} + +void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE2_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, + const unsigned char in[96]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + + sqr_fp2(ret.Y, ret.X); + mul_fp2(ret.Y, ret.Y, ret.X); + add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ + if (!sqrt_fp2(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE2_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp2(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) +{ return POINTonE2_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, + const unsigned char in[192]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); + limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); + mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); + + if (!POINTonE2_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return BLST_SUCCESS; +} + +static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, + const unsigned char in[192]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE2_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE2_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, + const unsigned char in[192]) +{ return POINTonE2_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE2, 384x, fp2) +POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_ADD_IMPL(POINTonE2, 384x, fp2) +POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) +POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) + +void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add(out, a, b); } + +void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd(out, a, b, NULL); } + +void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_add_affine(out, a, b); } + +void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_dadd_affine(out, a, b); } + +void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_double(out, a); } + +int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) +{ return (int)POINTonE2_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) + +DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) +POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) +POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) +#endif + +static void psi(POINTonE2 *out, const POINTonE2 *in) +{ + static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ + { 0 }, + { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ + TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } + }; + static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ + { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 + ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ + TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e + 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ + TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), 
TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + }; + + vec_copy(out, in, sizeof(*out)); + cneg_fp(out->X[1], out->X[1], 1); mul_fp2(out->X, out->X, frobenius_x); + cneg_fp(out->Y[1], out->Y[1], 1); mul_fp2(out->Y, out->Y, frobenius_y); + cneg_fp(out->Z[1], out->Z[1], 1); +} + +/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mul_w5 */ +static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* break down SK to "digits" with |z| as radix [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + div_by_z(val.l); + div_by_z(val.l + NLIMBS(256)/2); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s, NULL }; + POINTonE2 table[4][1<<(5-1)]; /* 18KB */ + size_t i; + + POINTonE2_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + psi(&table[1][i], &table[0][i]); + psi(&table[2][i], &table[1][i]); + psi(&table[3][i], &table[2][i]); + POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */ + POINTonE2_cneg(&table[3][i], 1); + } + + POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table); + } + + vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ +} + +static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) +{ + vec384x Z, ZZ; + limb_t inf; + + POINTonE2_mult_gls(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp2(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp2(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), + inf); /* Z = inf ? 
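Background on the GLS decomposition used by POINTonE2_mult_gls (standard for BLS12 curves, up to the usual sign conventions): psi is the twist-Frobenius-untwist endomorphism, which on the order-r subgroup acts as multiplication by p mod r, and p ≡ z (mod r) for the curve parameter z, so the scalar is cut into four roughly 64-bit digits in radix |z| (the div_by_zz/div_by_z calls) and the four quarter-length multiplications share one doubling chain; the psi and psi³ tables are negated because z itself is negative:

\[
\psi(Q) = [\,p \bmod r\,]\,Q = [z]\,Q \quad (Q\in\mathbb{G}_{2}),\qquad
k = k_{0}+k_{1}z+k_{2}z^{2}+k_{3}z^{3},\quad |k_{i}|\approx r^{1/4},
\]
\[
[k]Q = [k_{0}]Q + [k_{1}]\,\psi(Q) + [k_{2}]\,\psi^{2}(Q) + [k_{3}]\,\psi^{3}(Q).
\]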
0 : 1 */ +} + +void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK) +{ POINTonE2_sign(out, &BLS12_381_G2, SK); } + +void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK) +{ POINTonE2_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK, + const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, &BLS12_381_G2, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig, + const POINTonE2 *hash, const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 144) { + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE2_mult_gls(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p2_affine_is_equal(const POINTonE2_affine *a, + const POINTonE2_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p2_is_inf(const POINTonE2 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE2 *blst_p2_generator(void) +{ return &BLS12_381_G2; } + +int blst_p2_affine_is_inf(const POINTonE2_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE2_affine *blst_p2_affine_generator(void) +{ return (const POINTonE2_affine *)&BLS12_381_G2; } + +size_t blst_p2_sizeof(void) +{ return sizeof(POINTonE2); } + +size_t blst_p2_affine_sizeof(void) +{ return sizeof(POINTonE2_affine); } diff --git a/crypto/blst_src/ec_mult.h b/crypto/blst_src/ec_mult.h new file mode 100644 index 00000000000..192f7337cbf --- /dev/null +++ b/crypto/blst_src/ec_mult.h @@ -0,0 +1,289 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_EC_MULT_H__ +#define __BLS12_381_ASM_EC_MULT_H__ + +#include "point.h" + +/* Works up to 9 bits */ +static limb_t get_wval(const byte *d, size_t off, size_t bits) +{ + size_t top = off + bits - 1; + limb_t ret; + + ret = ((limb_t)d[top / 8] << 8) | d[off / 8]; + + return ret >> (off%8); +} + +/* Works up to 25 bits. 
*/ +static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) +{ + size_t i, top = (off + bits - 1)/8; + limb_t ret, mask = (limb_t)0 - 1; + + d += off/8; + top -= off/8-1; + + /* this is not about constant-time-ness, but branch optimization */ + for (ret=0, i=0; i<4;) { + ret |= (*d & mask) << (8*i); + mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1)); + d += 1 & mask; + } + + return ret >> (off%8); +} + +/* + * Window value encoding that utilizes the fact that -P is trivially + * calculated, which allows to halve the size of pre-computed table, + * is attributed to A. D. Booth, hence the name of the subroutines... + */ +static limb_t booth_encode(limb_t wval, size_t sz) +{ + limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + + wval = (wval + 1) >> 1; + wval = (wval & ~mask) | ((0-wval) & mask); + + /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ + return wval; +} + +/* + * Key feature of these constant-time subroutines is that they tolerate + * zeros in most significant bit positions of the scalar[s], or in other + * words, zero-padded scalar values. This means that one can and should + * pass order's bit-length, which is customarily publicly known, instead + * of the factual scalars' bit-lengths. This is facilitated by point + * addition subroutines implemented to handle points at infinity, which + * are encoded as Z==0. [Doubling agorithms handle such points at + * infinity "naturally," since resulting Z is product of original Z.] + */ +#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ +static void ptype##_gather_booth_w##SZ(ptype *restrict p, \ + const ptype table[1<<(SZ-1)], \ + limb_t booth_idx) \ +{ \ + size_t i; \ + bool_t booth_sign = (booth_idx >> SZ) & 1; \ +\ + booth_idx &= (1< 0) \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + else \ + wval = (scalar[0] << 1) & wmask; \ +\ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table[0], wval); \ +\ + i = 1; \ + while (bits > 0) { \ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +\ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ + i = 0; scalar_s = scalars; \ + } \ +\ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (scalar[0] << 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} \ +\ +static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \ + const byte *scalar, size_t bits) \ +{ \ + limb_t wmask, wval; \ + size_t j, window; \ + ptype temp[1]; \ + ptype table[1<<(SZ-1)]; \ +\ + ptype##_precompute_w##SZ(table, point); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % SZ; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table, wval); \ +\ + while (bits > 0) { \ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ +\ + wval = bits ? 
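A standalone sketch of the signed-window recoding idea behind booth_encode and the 2^(SZ-1)-entry tables: every window digit is mapped into [-2^(w-1), 2^(w-1)], so only the positive multiples 1..2^(w-1) need precomputing and the negative ones come from cheap point negation. Toy version on a 64-bit scalar with hypothetical names; the real code recodes on the fly from a byte string and stays branch-free:

#include <stdint.h>
#include <stdio.h>

#define W 5   /* window size, matching the w5 tables above */

/* Recode k into digits d[0..n-1] with k = sum d[i]*2^(W*i) and every
 * digit in [-(2^(W-1)-1), 2^(W-1)].  digits[] needs room for
 * ceil(64/W)+1 entries. */
static int recode(int8_t digits[], uint64_t k)
{
    int n = 0;
    while (k) {
        int d = (int)(k & ((1u << W) - 1));     /* k mod 2^W             */
        if (d > (1 << (W - 1)))                 /* map into signed range */
            d -= 1 << W;
        digits[n++] = (int8_t)d;
        k = (k - (uint64_t)(int64_t)d) >> W;    /* k-d is divisible by 2^W */
    }
    return n;
}

int main(void)
{
    int8_t d[16];
    uint64_t k = 0x1234567890abcdefULL, acc = 0;
    int n = recode(d, k), i;

    for (i = n; i--;)                           /* Horner: sum d[i]*2^(W*i) */
        acc = (acc << W) + (uint64_t)(int64_t)d[i];
    printf("%d\n", acc == k);                   /* prints 1 */
    return 0;
}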
get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table, wval); \ + if (bits > 0) ptype##_add(ret, ret, temp); \ + else ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} + +#if 0 +/* ~50%, or ~2x[!] slower than w5... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit, pbit = 0; \ +\ + vec_copy(sum, p, sizeof(ptype)); \ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##_cswap(ret, sum, bit); \ + ptype##_add(sum, sum, ret); \ + ptype##_double(ret, ret); \ + pbit ^= bit; \ + } \ + ptype##_cswap(ret, sum, pbit); \ +} +#else +/* >40% better performance than above, [and ~30% slower than w5]... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *out, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype##xz sum[1]; \ + ptype##xz pxz[1]; \ + ptype##xz ret[1]; \ + bool_t bit, pbit = 0; \ +\ + ptype##xz_ladder_pre(pxz, p); \ + vec_copy(sum, pxz, sizeof(ptype##xz)); \ + vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##xz_cswap(ret, sum, bit); \ + ptype##xz_ladder_step(ret, sum, pxz); \ + pbit ^= bit; \ + } \ + ptype##xz_cswap(ret, sum, pbit); \ + ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ +} +#endif + +/* + * Sole reason for existence of this implementation is that addition + * with affine point renders a share of multiplications redundant by + * virtue of Z==1. And since pre-defined generator point can be and + * customarily is instantiated affine, it would be hardly appropriate + * to pass on this opportunity. Though while it's faster than the + * generic ladder implementation, by ~25%, it's not faster than XZ one + * above, <15% slower. Just in case, it's faster than generic ladder + * even if one accounts for prior conversion to affine coordinates, + * so that choice [for resource-constrained case] is actually between + * this plus said conversion and XZ ladder... + * + * To summarize, if ptype##_mult_w5 executed in one unit of time, then + * - naive ptype##_mult_ladder would execute in ~2; + * - XZ version above - in ~1.4; + * - ptype##_affine_mult_ladder below - in ~1.65; + * - [small-footprint ptype##_to_affine would run in ~0.18]. + * + * Caveat lector, |p_affine|*(order+2) produces wrong result, because + * addition doesn't handle doubling. Indeed, P*(order+1) is P and it + * fails to add with itself producing infinity in last addition. But + * as long as |scalar| is reduced modulo order, as it should be, it's + * not a problem... 
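/*
 * Aside, not part of the vendored sources: a standalone toy showing the
 * MSB-first "double, add, conditionally keep" loop that the ladder macros
 * here instantiate for curve points. Plain integers stand in for points,
 * so doubling is *2 and addition is +; all names are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t toy_mult_ladder(uint64_t point, const uint8_t *scalar,
                                size_t bits)
{
    uint64_t acc = 0;                       /* plays the role of infinity */

    while (bits--) {
        uint64_t sum;
        int bit = (scalar[bits / 8] >> (bits % 8)) & 1;

        acc *= 2;                           /* ptype##_double(ret, ret) */
        sum = acc + point;                  /* ptype##_add_affine(sum, ret, p) */
        if (bit)                            /* ptype##_ccopy(ret, sum, bit) */
            acc = sum;
    }
    return acc;
}

int main(void)
{
    uint8_t k[1] = { 45 };                  /* scalar 45, scanned over 8 bits */

    printf("%llu\n", (unsigned long long)toy_mult_ladder(7, k, 8)); /* 315 */
    return 0;
}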
+ */ +#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ +static void ptype##_affine_mult_ladder(ptype *ret, \ + const ptype##_affine *p_affine, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit; \ +\ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + ptype##_double(ret, ret); \ + ptype##_add_affine(sum, ret, p_affine); \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + ptype##_ccopy(ret, sum, bit); \ + } \ +} +#endif diff --git a/crypto/blst_src/ec_ops.h b/crypto/blst_src/ec_ops.h new file mode 100644 index 00000000000..0d531f816e2 --- /dev/null +++ b/crypto/blst_src/ec_ops.h @@ -0,0 +1,787 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_384_ASM_EC_OPS_H__ +#define __BLS12_384_ASM_EC_OPS_H__ +/* + * Addition that can handle doubling [as well as points at infinity, + * which are encoded as Z==0] in constant time. It naturally comes at + * cost, but this subroutine should be called only when independent + * points are processed, which is considered reasonable compromise. + * For example, ptype##s_mult_w5 calls it, but since *major* gain is + * result of pure doublings being effectively divided by amount of + * points, slightly slower addition can be tolerated. But what is the + * additional cost more specifically? Best addition result is 11M+5S, + * while this routine takes 13M+5S (+1M+1S if a4!=0), as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1*Z2^2 | U1 = X1 + * U2 = X2*Z1^2 | + * S1 = Y1*Z2^3 | S1 = Y1 + * S2 = Y2*Z1^3 | + * zz = Z1*Z2 | zz = Z1 + * H = U2-U1 | H' = 2*Y1 + * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] + * sx = U1+U2 | sx = X1+X1 + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = H*zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
+ */ +#define POINT_DADD_IMPL(ptype, bits, field) \ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4) \ +{ \ + ptype p3; /* starts as (U1, S1, zz) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ + sqr_##field(dbl.R, p1->X); /* X1^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ + add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(p3.X, p2->Z); /* Z2^2 */\ + mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ +\ + if (a4 != NULL) { \ + sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ + mul_##field(p3.Y, p3.Y, a4); \ + add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ + } \ +\ + mul_##field(p3.Y, p1->Y, p2->Z); \ + mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ + mul_##field(add.R, p2->Y, p1->Z); \ + mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ +\ + mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ + sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ +\ + /* make the choice between addition and doubling */\ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * Addition with affine point that can handle doubling [as well as + * points at infinity, with |p1| being encoded as Z==0 and |p2| as + * X,Y==0] in constant time. But at what additional cost? Best + * addition result is 7M+4S, while this routine takes 8M+5S, as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1 | U1 = X2 + * U2 = X2*Z1^2 | + * S1 = Y1 | S1 = Y2 + * S2 = Y2*Z1^3 | + * H = U2-X1 | H' = 2*Y2 + * R = S2-Y1 | R' = 3*X2^2[+a] + * sx = X1+U2 | sx = X2+X2 + * zz = H*Z1 | zz = H' + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
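/*
 * Aside, not part of the vendored sources: the dadd macros in this file
 * resolve the addition-vs-doubling choice with vec_select() rather than a
 * branch. This standalone toy shows the underlying masked-select idea on a
 * single word; all names here are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t ct_select(uint64_t a, uint64_t b, uint64_t pick_a)
{
    uint64_t mask = (uint64_t)0 - (pick_a & 1);  /* all-ones iff pick_a == 1 */

    return (a & mask) | (b & ~mask);             /* no data-dependent branch */
}

int main(void)
{
    printf("%llu %llu\n",
           (unsigned long long)ct_select(11, 22, 1),   /* 11 */
           (unsigned long long)ct_select(11, 22, 0));  /* 22 */
    return 0;
}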
+ */ +#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; /* starts as (,, H*Z1) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ + add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ + sqr_##field(dbl.R, p2->X); /* X2^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ + add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ + mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ +\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ + sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ +\ + mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ +\ + /* make the choice between addition and doubling */ \ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ + vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + * with twist to handle either input at infinity, which are encoded as Z==0. 
+ */ +#define POINT_ADD_IMPL(ptype, bits, field) \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ + mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ +\ + sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ +\ + mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + sub_##field(H, H, U1); /* H = U2-U1 */\ +\ + add_##field(I, H, H); /* 2*H */\ + sqr_##field(I, I); /* I = (2*H)^2 */\ +\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(S1, S1, J); /* S1*J */\ +\ + mul_##field(p3.Y, U1, I); /* V = U1*I */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, S1); \ + sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ +\ + add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ + mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with twist to handle either input at infinity, with |p1| encoded as Z==0, + * and |p2| as X==Y==0. 
+ */ +#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, H, HH, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ +\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ + sub_##field(H, H, p1->X); /* H = U2-X1 */\ +\ + sqr_##field(HH, H); /* HH = H^2 */\ + add_##field(I, HH, HH); \ + add_##field(I, I, I); /* I = 4*HH */\ +\ + mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(I, J, p1->Y); /* Y1*J */\ +\ + sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, I); \ + sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ +\ + add_##field(p3.Z, p1->Z, H); /* Z1+H */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ +\ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + */ +#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ +static void ptype##_double(ptype *p3, const ptype *p1) \ +{ \ + vec##bits A, B, C; \ +\ + sqr_##field(A, p1->X); /* A = X1^2 */\ + sqr_##field(B, p1->Y); /* B = Y1^2 */\ + sqr_##field(C, B); /* C = B^2 */\ +\ + add_##field(B, B, p1->X); /* X1+B */\ + sqr_##field(B, B); /* (X1+B)^2 */\ + sub_##field(B, B, A); /* (X1+B)^2-A */\ + sub_##field(B, B, C); /* (X1+B)^2-A-C */\ + add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ +\ + mul_by_3_##field(A, A); /* E = 3*A */\ +\ + sqr_##field(p3->X, A); /* F = E^2 */\ + sub_##field(p3->X, p3->X, B); \ + sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ +\ + add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ + mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ +\ + mul_by_8_##field(C, C); /* 8*C */\ + sub_##field(p3->Y, B, p3->X); /* D-X3 */\ + mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ + sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ +} + +#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ +static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ +{ \ + mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ + sqr_##field(pxz->Z, p->Z); \ + mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 + * with twist to handle either input at infinity, which are encoded as Z==0. + * Just in case, order of doubling and addition is reverse in comparison to + * hyperelliptic.org entry. This was done to minimize temporary storage. + * + * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
+ */ +#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p) \ +{ \ + ptype##xz p5; \ + vec##bits A, B, C, D, XX, ZZ; \ + bool_t r_inf, s_inf; \ + /* s += r */\ + mul_##field(A, r->X, s->X); /* A = X2*X3 */\ + mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ + mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ + mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ +\ + sqr_##field(A, A); /* (A[-a*B])^2 */\ + add_##field(p5.X, C, D); /* C+D */\ + mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ + mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ + sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ + mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ +\ + sub_##field(p5.Z, C, D); /* C-D */\ + sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ + mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ +\ + r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ +\ + vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ + vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ + /* r *= 2 */\ + sqr_##field(XX, r->X); /* XX = X2^2 */\ + sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ +\ + add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ + sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ + sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ + sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ +\ + sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ + mul_##field(B, r->Z, ZZ); /* E*ZZ */\ + mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ + sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ +\ + sqr_##field(ZZ, ZZ); /* ZZ^2 */\ + mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ + mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ +} + +/* + * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, + * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist + * and conversion to Jacobian coordinates from /.../ecp_smpl.c, + * and with twist to recover from |s| at infinity [which occurs when + * multiplying by (order-1)]. + * + * X4 = 2*Y1*X2*Z3*Z1*Z2 + * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 + * Z4 = 2*Y1*Z3*Z2^2*Z1 + * + * Z3x2 = 2*Z3 + * Y1Z3x2 = Y1*Z3x2 + * Z1Z2 = Z1*Z2 + * X1Z2 = X1*Z2 + * X2Z1 = X2*Z1 + * X4 = Y1Z3x2*X2*Z1Z2 + * A = b*Z3x2*(Z1Z2)^2 + * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) + * C = X3*(X1Z2-X2Z1)^2 + * Y4 = A+B-C + * Z4 = Y1Z3x2*Z1Z2*Z2 + * + * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
+ */ +#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##xz_ladder_post(ptype *p4, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1) \ +{ \ + vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ + bool_t s_inf; \ +\ + add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ + mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ + mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ + mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ + mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ +\ + mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ + mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ +\ + sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ + mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ + mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ +\ + mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ + mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ + add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ + mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ +\ + sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ + sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ + mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ +\ + add_##field(A, A, B); /* A+B */\ + sub_##field(A, A, C); /* Y4 = A+B-C */\ +\ + mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ + mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ +\ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ + vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ + vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ + vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ + ptype##_cneg(p4, s_inf); \ + /* to Jacobian */\ + mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ + sqr_##field(B, p4->Z); \ + mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ +} + +#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ +static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ +{ \ + vec##bits Z1Z1, Z2Z2; \ + ptype##_affine a1, a2; \ + bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ + mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ +\ + mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ + mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle + * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| + * and replacing few first references to |X3| in the formula, up to step + * 21, with it. 12M[+27A], doubling and infinity are handled by the + * formula itself. Infinity is to be encoded as [0, !0, 0]. + */ +#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ + const ptype##proj *p2) \ +{ \ + vec##bits t0, t1, t2, t3, t4, t5; \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ + add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 8. 
t3 = t3-t4 */\ + add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ + add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ + mul_##field(t4, t4, t5); /* 11. t4 = t4*t5 */\ + add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ + sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ + add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ + add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ + mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ + add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ + sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ + mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ + mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ + mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ + add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ + mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ + mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ + mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle + * |p2| being infinity encoded as [0, 0]. 11M[+21A]. + */ +#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype##proj p3[1]; \ + vec##bits t0, t1, t2, t3, t4; \ + limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ + mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ + add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ + mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ + add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ + mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ + mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ + mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ + add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ + mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ + mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ + mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ +\ + vec_select(out, p1, p3, sizeof(*out), p2inf); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle + * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y + * and reordering operations to bring references to |p1| forward. + * 6M+2S[+13A]. 
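/*
 * Aside, not part of the vendored sources: a standalone toy of the output
 * aliasing concern described above. When the result may share storage with
 * an input, anything still needed later is captured in a temporary (as the
 * |t3|/|t5| temporaries are in these macros) before the output is written.
 */
#include <stdio.h>

typedef struct { int x, y; } pair;

static void rotate_sum_aliased(pair *out, const pair *in)
{
    out->x = in->x + in->y;
    out->y = in->x;             /* wrong if out == in: x is already clobbered */
}

static void rotate_sum_safe(pair *out, const pair *in)
{
    int t = in->x;              /* capture before overwriting the output */

    out->x = in->x + in->y;
    out->y = t;
}

int main(void)
{
    pair a = { 2, 3 }, b = { 2, 3 };

    rotate_sum_aliased(&a, &a);
    rotate_sum_safe(&b, &b);
    printf("aliased: (%d,%d) safe: (%d,%d)\n", a.x, a.y, b.x, b.y);
    return 0;                   /* prints "aliased: (5,5) safe: (5,2)" */
}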
+ */ +#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ +{ \ + vec##bits t0, t1, t2, t3; \ +\ + sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ + mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ + sqr_##field(t2, p1->Z); /* 6. t2 = Z*Z */\ + mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ + lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ + mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ + mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ + mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ + add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ + mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ + mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ + sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ + mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ + add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ + mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ + add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ +} + +#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ +{ \ + vec##bits ZZ; \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + mul_##field(out->Y, in->Y, ZZ); \ + vec_copy(out->Z, in->Z, sizeof(out->Z)); \ +} + +#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ +static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ +{ \ + vec##bits ZZ; \ + limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ + mul_##field(out->Z, ZZ, in->Z); \ +} + +/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 + * with twist to handle either input at infinity. Addition costs 12M+2S, + * while conditional doubling - 4M+6M+3S. 
+ */ +#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ +static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##xyzz *p2) \ +{ \ + vec##bits U, S, P, R; \ +\ + if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3, p2, sizeof(*p3)); \ + return; \ + } \ +\ + mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ + mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + sub_##field(P, P, U); /* P = U2-U1 */\ + sub_##field(R, R, S); /* R = S2-S1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, U, PP); /* Q = U1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, S, PPP); /* S1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ + mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ + mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ + mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits V, W, M; /* double |p1| */\ +\ + add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ + sqr_##field(V, U); /* V = U^2 */\ + mul_##field(W, V, U); /* W = U*V */\ + mul_##field(S, p1->X, V); /* S = X1*V */\ + sqr_##field(M, p1->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ + mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 + * with twists to handle even subtractions and either input at infinity. + * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. 
+ */ +#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##_affine *p2, \ + bool_t subtract) \ +{ \ + vec##bits P, R; \ +\ + if (vec_is_zero(p2, sizeof(*p2))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ + cneg_##field(p3->ZZZ, one, subtract); \ + vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ + return; \ + } \ +\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + cneg_##field(R, R, subtract); \ + sub_##field(P, P, p1->X); /* P = U2-X1 */\ + sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ + mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits U, S, M; /* double |p2| */\ +\ + add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ + sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ + mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ + mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ + sqr_##field(M, p2->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ +{ \ + mul_##field(out->X, in->X, in->ZZ); \ + mul_##field(out->Y, in->Y, in->ZZZ); \ + vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ +} + +#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ +static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ +{ \ + vec_copy(out->X, in->X, 2*sizeof(out->X)); \ + sqr_##field(out->ZZ, in->Z); \ + mul_##field(out->ZZZ, out->ZZ, in->Z); \ +} + +#endif diff --git a/crypto/blst_src/errors.h b/crypto/blst_src/errors.h new file mode 100644 index 00000000000..425daeb486f --- /dev/null +++ b/crypto/blst_src/errors.h @@ -0,0 +1,19 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_ERRORS_H__ +#define __BLS12_381_ASM_ERRORS_H__ + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, +} BLST_ERROR; + +#endif diff --git a/crypto/blst_src/exp.c b/crypto/blst_src/exp.c new file mode 100644 index 00000000000..55c5c5a7875 --- /dev/null +++ b/crypto/blst_src/exp.c @@ -0,0 +1,55 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ +#if 1 + vec384 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_mont_384(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +#else + unsigned int i; + vec384 sqr; + + vec_copy(sqr, inp, sizeof(sqr)); + for (i = 0; !is_bit_set(pow, i++);) + sqr_mont_384(sqr, sqr, sqr, p, n0); + vec_copy(out, sqr, sizeof(sqr)); + for (; i < pow_bits; i++) { + sqr_mont_384(sqr, sqr, sqr, p, n0); + if (is_bit_set(pow, i)) + mul_mont_384(out, out, sqr, p, n0); + } +#endif +} + +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ + vec384x ret; + + vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ + --pow_bits; /* most significant bit is accounted for, skip over */ + while (pow_bits--) { + sqr_mont_384x(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384x(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ +} diff --git a/crypto/blst_src/exports.c b/crypto/blst_src/exports.c new file mode 100644 index 00000000000..ad720999883 --- /dev/null +++ b/crypto/blst_src/exports.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Why this file? Overall goal is to ensure that all internal calls + * remain internal after linking application. This is to both + * + * a) minimize possibility of external name conflicts (since all + * non-blst-prefixed and [assembly subroutines] remain static); + * b) preclude possibility of unintentional internal reference + * overload in shared library context (one can achieve same + * effect with -Bsymbolic, but we don't want to rely on end-user + * to remember to use it); + */ + +#include "fields.h" +#include "bytes.h" + +/* + * BLS12-381-specifc Fr shortcuts to assembly. 
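/*
 * Aside, not part of the vendored sources: the exported wrappers below
 * repeatedly use a one-member-initialized union as a run-time endianness
 * probe. This standalone snippet shows the idiom in isolation: the first
 * byte of a long initialized to 1 reads back as 1 only on little-endian
 * targets.
 */
#include <stdio.h>

int main(void)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    printf("little-endian: %d\n", (int)is_endian.little);
    return 0;
}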
+ */ +void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) +{ add_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) +{ sub_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_mul_by_3(vec256 ret, const vec256 a) +{ mul_by_3_mod_256(ret, a, BLS12_381_r); } + +void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) +{ lshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) +{ rshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +void blst_fr_sqr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_cneg(vec256 ret, const vec256 a, int flag) +{ cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } + +void blst_fr_to(vec256 ret, const vec256 a) +{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } + +void blst_fr_from(vec256 ret, const vec256 a) +{ from_mont_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_from_scalar(vec256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); + vec_zero(out, sizeof(out)); + } +} + +void blst_scalar_from_fr(pow256 ret, const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + from_mont_256(out, a, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_fr_check(const pow256 a) +{ return (int)(check_mod_256(a, BLS12_381_r) | + bytes_are_zero(a, sizeof(pow256))); +} + +int blst_sk_check(const pow256 a) +{ return (int)check_mod_256(a, BLS12_381_r); } + +int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) +{ + vec256 a_fr, b_fr; + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { + limbs_from_le_bytes(a_fr, a, sizeof(a_fr)); + limbs_from_le_bytes(b_fr, b, sizeof(a_fr)); + a = (const byte *)a_fr; + b = (const byte *)b_fr; + } + mul_mont_sparse_256(a_fr, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(b_fr, (const limb_t *)b, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(a_fr, a_fr, b_fr, BLS12_381_r, r0); + from_mont_256(a_fr, a_fr, BLS12_381_r, r0); + le_bytes_from_limbs(ret, a_fr, sizeof(a_fr)); + + return (int)(vec_is_zero(a_fr, sizeof(a_fr)) ^ 1); +} + +void blst_sk_inverse(pow256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { + limb_t *out = (limb_t *)ret; + mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 
32); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) +{ add_fp(ret, a, b); } + +void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) +{ sub_fp(ret, a, b); } + +void blst_fp_mul_by_3(vec384 ret, const vec384 a) +{ mul_by_3_fp(ret, a); } + +void blst_fp_mul_by_8(vec384 ret, const vec384 a) +{ mul_by_8_fp(ret, a); } + +void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) +{ lshift_fp(ret, a, count); } + +void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) +{ mul_fp(ret, a, b); } + +void blst_fp_sqr(vec384 ret, const vec384 a) +{ sqr_fp(ret, a); } + +void blst_fp_cneg(vec384 ret, const vec384 a, int flag) +{ cneg_fp(ret, a, is_zero(flag) ^ 1); } + +void blst_fp_to(vec384 ret, const vec384 a) +{ mul_fp(ret, a, BLS12_381_RR); } + +void blst_fp_from(vec384 ret, const vec384 a) +{ from_fp(ret, a); } + +/* + * Fp serialization/deserialization. + */ +void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) +{ + if (sizeof(limb_t) == 8) { + int i; + for (i = 0; i < 6; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); + a = (const unsigned int *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) +{ + if (sizeof(limb_t) == 4) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) { + limb_t limb = out[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); + } + } +} + +void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 6; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + } +} + +void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_be_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + be_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_le_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + le_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. 
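/*
 * Aside, not part of the vendored sources: a standalone toy of the limb
 * packing performed by blst_fp_from_uint32()/blst_uint32_from_fp() above
 * on 64-bit builds, where two 32-bit words make up one 64-bit limb.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t words[2] = { 0x89abcdefu, 0x01234567u };
    uint64_t limb = words[0] | ((uint64_t)words[1] << 32);  /* pack */

    uint32_t lo = (uint32_t)limb;                           /* unpack */
    uint32_t hi = (uint32_t)(limb >> 32);

    printf("%016llx -> %08x %08x\n", (unsigned long long)limb, lo, hi);
    return 0;   /* 0123456789abcdef -> 89abcdef 01234567 */
}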
+ */ +void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) +{ add_fp2(ret, a, b); } + +void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) +{ sub_fp2(ret, a, b); } + +void blst_fp2_mul_by_3(vec384x ret, const vec384x a) +{ mul_by_3_fp2(ret, a); } + +void blst_fp2_mul_by_8(vec384x ret, const vec384x a) +{ mul_by_8_fp2(ret, a); } + +void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) +{ lshift_fp2(ret, a, count); } + +void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) +{ mul_fp2(ret, a, b); } + +void blst_fp2_sqr(vec384x ret, const vec384x a) +{ sqr_fp2(ret, a); } + +void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) +{ cneg_fp2(ret, a, is_zero(flag) ^ 1); } + +/* + * Scalar serialization/deseriazation + */ +void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + } +} + +void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = (unsigned int)(*a++); + w |= (unsigned int)(*a++) << 8; + w |= (unsigned int)(*a++) << 16; + w |= (unsigned int)(*a++) << 24; + ret[i] = w; + } +} + +void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + *ret++ = (byte)(w >> 32); + *ret++ = (byte)(w >> 40); + *ret++ = (byte)(w >> 48); + *ret++ = (byte)(w >> 56); + } +} + +void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = (unsigned long long)(*a++); + w |= (unsigned long long)(*a++) << 8; + w |= (unsigned long long)(*a++) << 16; + w |= (unsigned long long)(*a++) << 24; + w |= (unsigned long long)(*a++) << 32; + w |= (unsigned long long)(*a++) << 40; + w |= (unsigned long long)(*a++) << 48; + w |= (unsigned long long)(*a++) << 56; + ret[i] = w; + } +} + +void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) +{ + vec256 out; + limbs_from_be_bytes(out, a, sizeof(out)); + le_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + vec256 out; + limbs_from_le_bytes(out, a, sizeof(out)); + be_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char 
little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 4; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + int i; + + from_mont_256(out, a, BLS12_381_r, r0); + for (i = 0; i < 4; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + while (n > 32) { + limbs_from_le_bytes(t.digit, bytes, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + bytes += 32; + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_le_bytes(t.digit, bytes, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + bytes += n; + while (n > 32) { + limbs_from_be_bytes(t.digit, bytes -= 32, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_be_bytes(t.digit, bytes -= n, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +/* + * Test facilitator + */ +void blst_scalar_from_hexascii(pow256 ret, const char *hex) +{ bytes_from_hexascii(ret, sizeof(pow256), hex); } + +void blst_fr_from_hexascii(vec256 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec256), hex); + mul_mont_sparse_256(ret, ret, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fp_from_hexascii(vec384 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec384), hex); + mul_fp(ret, ret, BLS12_381_RR); +} diff --git a/crypto/blst_src/fields.h b/crypto/blst_src/fields.h new file mode 100644 index 00000000000..515219f62dd --- /dev/null +++ b/crypto/blst_src/fields.h @@ -0,0 +1,116 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_FIELDS_H__ +#define __BLS12_381_ASM_FIELDS_H__ + +#include "vect.h" +#include "consts.h" + +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) +{ add_mod_384(ret, a, b, BLS12_381_P); } + +static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) +{ sub_mod_384(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp(vec384 ret, const vec384 a) +{ mul_by_3_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ mul_by_8_mod_384(ret, a, BLS12_381_P); } + +static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) +{ lshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) +{ rshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void div_by_2_fp(vec384 ret, const vec384 a) +{ div_by_2_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp(vec384 ret, const vec384 a) +{ sqr_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) +{ cneg_mod_384(ret, a, flag, BLS12_381_P); } + +static inline void from_fp(vec384 ret, const vec384 a) +{ from_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void redc_fp(vec384 ret, const vec768 a) +{ redc_mont_384(ret, a, BLS12_381_P, p0); } + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. + */ +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ add_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ sub_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_mod_384(ret[0], a[0], count, BLS12_381_P); + lshift_mod_384(ret[1], a[1], count, BLS12_381_P); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) +{ + cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); + cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); +} + +#define vec_load_global vec_copy + +static void reciprocal_fp(vec384 out, const vec384 inp); +static void flt_reciprocal_fp(vec384 out, const vec384 inp); +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); +static bool_t sqrt_fp(vec384 out, const vec384 inp); + +static void reciprocal_fp2(vec384x out, const vec384x inp); +static void flt_reciprocal_fp2(vec384x out, const vec384x inp); +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, const vec384x magic_ZZZ); +static bool_t sqrt_fp2(vec384x out, const vec384x inp); +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp); + +typedef vec384x vec384fp2; +typedef vec384fp2 vec384fp6[3]; +typedef vec384fp6 vec384fp12[2]; + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void cyclotomic_sqr_fp12(vec384fp12 ret, 
const vec384fp12 a); +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0); +static void conjugate_fp12(vec384fp12 a); +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); + +#define neg_fp(r,a) cneg_fp((r),(a),1) +#define neg_fp2(r,a) cneg_fp2((r),(a),1) + +#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/crypto/blst_src/fp12_tower.c b/crypto/blst_src/fp12_tower.c new file mode 100644 index 00000000000..ab247a8ebf0 --- /dev/null +++ b/crypto/blst_src/fp12_tower.c @@ -0,0 +1,789 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +/* + * Fp2 = Fp[u] / (u^2 + 1) + * Fp6 = Fp2[v] / (v^3 - u - 1) + * Fp12 = Fp6[w] / (w^2 - v) + */ + +static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) +{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } + +#if 1 && !defined(__BLST_NO_ASM__) +#define __FP2x2__ +/* + * Fp2x2 is a "widened" version of Fp2, which allows to consolidate + * reductions from several multiplications. In other words instead of + * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter + * addition is double-width... To be more specific this gives ~7-10% + * faster pairing depending on platform... + */ +typedef vec768 vec768x[2]; + +static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) +{ + /* caveat lector! 
|ret| may not be same as |a| */ + sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); + add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); +} + +static inline void redc_fp2x2(vec384x ret, const vec768x a) +{ + redc_mont_384(ret[0], a[0], BLS12_381_P, p0); + redc_mont_384(ret[1], a[1], BLS12_381_P, p0); +} + +static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) +{ +#if 1 + mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ +#else + union { vec384 x[2]; vec768 x2; } t; + + add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); + add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); + mul_384(ret[1], t.x[0], t.x[1]); + + mul_384(ret[0], a[0], b[0]); + mul_384(t.x2, a[1], b[1]); + + sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); + sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); + + sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); +#endif +} + +static void sqr_fp2x2(vec768x ret, const vec384x a) +{ +#if 1 + sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ +#else + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], BLS12_381_P); + sub_mod_384(t1, a[0], a[1], BLS12_381_P); + + mul_384(ret[1], a[0], a[1]); + add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); + + mul_384(ret[0], t0, t1); +#endif +} +#endif /* __FP2x2__ */ + +/* + * Fp6 extension + */ +#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ +typedef vec768x vec768fp6[3]; + +static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, + const vec768fp6 b) +{ + sub_fp2x2(ret[0], a[0], b[0]); + sub_fp2x2(ret[1], a[1], b[1]); + sub_fp2x2(ret[2], a[2], b[2]); +} + +static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768x t0, t1, t2; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + mul_fp2x2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(aa, a[1], a[2]); + add_fp2(bb, b[1], b[2]); + mul_fp2x2(ret[0], aa, bb); + sub_fp2x2(ret[0], ret[0], t1); + sub_fp2x2(ret[0], ret[0], t2); + mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ + add_fp2x2(ret[0], ret[1], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ + add_fp2x2(ret[1], ret[1], ret[2]); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(aa, a[0], a[2]); + add_fp2(bb, b[0], b[2]); + mul_fp2x2(ret[2], aa, bb); + sub_fp2x2(ret[2], ret[2], t0); + sub_fp2x2(ret[2], ret[2], t2); + add_fp2x2(ret[2], ret[2], t1); +} + +static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a) +{ + redc_fp2x2(ret[0], a[0]); + redc_fp2x2(ret[1], a[1]); + redc_fp2x2(ret[2], a[2]); +} + +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768fp6 r; + + mul_fp6x2(r, a, b); + redc_fp6x2(ret, r); /* narrow to normal width */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec768x s0, m01, m12, s2, rx; + + sqr_fp2x2(s0, a[0]); + + mul_fp2x2(m01, a[0], a[1]); + add_fp2x2(m01, m01, m01); + + mul_fp2x2(m12, a[1], a[2]); + add_fp2x2(m12, m12, m12); + + sqr_fp2x2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + 
sqr_fp2x2(rx, ret[2]); + sub_fp2x2(rx, rx, s0); + sub_fp2x2(rx, rx, s2); + sub_fp2x2(rx, rx, m01); + sub_fp2x2(rx, rx, m12); + redc_fp2x2(ret[2], rx); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2x2(rx, m12); + add_fp2x2(rx, rx, s0); + redc_fp2x2(ret[0], rx); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2x2(rx, s2); + add_fp2x2(rx, rx, m01); + redc_fp2x2(ret[1], rx); +} +#else +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, t2, t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + mul_fp2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(t4, a[1], a[2]); + add_fp2(t5, b[1], b[2]); + mul_fp2(t3, t4, t5); + sub_fp2(t3, t3, t1); + sub_fp2(t3, t3, t2); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2(t4, t2); + add_fp2(ret[1], ret[1], t4); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(t4, a[0], a[2]); + add_fp2(t5, b[0], b[2]); + mul_fp2(ret[2], t4, t5); + sub_fp2(ret[2], ret[2], t0); + sub_fp2(ret[2], ret[2], t2); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x s0, m01, m12, s2; + + sqr_fp2(s0, a[0]); + + mul_fp2(m01, a[0], a[1]); + add_fp2(m01, m01, m01); + + mul_fp2(m12, a[1], a[2]); + add_fp2(m12, m12, m12); + + sqr_fp2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2(ret[2], ret[2]); + sub_fp2(ret[2], ret[2], s0); + sub_fp2(ret[2], ret[2], s2); + sub_fp2(ret[2], ret[2], m01); + sub_fp2(ret[2], ret[2], m12); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2(ret[0], m12); + add_fp2(ret[0], ret[0], s0); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2(ret[1], s2); + add_fp2(ret[1], ret[1], m01); +} +#endif + +static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + add_fp2(ret[0], a[0], b[0]); + add_fp2(ret[1], a[1], b[1]); + add_fp2(ret[2], a[2], b[2]); +} + +static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + sub_fp2(ret[0], a[0], b[0]); + sub_fp2(ret[1], a[1], b[1]); + sub_fp2(ret[2], a[2], b[2]); +} + +static void neg_fp6(vec384fp6 ret, const vec384fp6 a) +{ + neg_fp2(ret[0], a[0]); + neg_fp2(ret[1], a[1]); + neg_fp2(ret[2], a[2]); +} + +#if 0 +#define mul_by_v_fp6 mul_by_v_fp6 +static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x t; + + mul_by_u_plus_1_fp2(t, a[2]); + vec_copy(ret[2], a[1], sizeof(a[1])); + vec_copy(ret[1], a[0], sizeof(a[0])); + vec_copy(ret[0], t, sizeof(t)); +} +#endif + +/* + * Fp12 extension + */ +#if defined(__FP2x2__) +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec768fp6 t0, t1, rx; + vec384fp6 t2; + + mul_fp6x2(t0, a[0], b[0]); + mul_fp6x2(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6x2(rx, ret[1], t2); + sub_fp6x2(rx, rx, t0); + sub_fp6x2(rx, 
rx, t1); + redc_fp6x2(ret[1], rx); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rx[0], t1[2]); + add_fp2x2(rx[0], t0[0], rx[0]); + add_fp2x2(rx[1], t0[1], t1[0]); + add_fp2x2(rx[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rx); +} + +static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + mul_fp2x2(ret[1], a[0], b); + mul_fp2x2(ret[2], a[1], b); +} + +static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp6 b) +{ + vec768x t0, t1; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + add_fp2x2(ret[0], ret[0], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2x2(ret[2], a[2], b[0]); + add_fp2x2(ret[2], ret[2], t1); +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec768fp6 t0, t1, rr; + vec384fp6 t2; + + mul_by_xy0_fp6x2(t0, a[0], xy00z0); + mul_by_0y0_fp6x2(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6x2(rr, ret[1], t2); + sub_fp6x2(rr, rr, t0); + sub_fp6x2(rr, rr, t1); + redc_fp6x2(ret[1], rr); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rr[0], t1[2]); + add_fp2x2(rr[0], t0[0], rr[0]); + add_fp2x2(rr[1], t0[1], t1[0]); + add_fp2x2(rr[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rr); +} +#else +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec384fp6 t0, t1, t2; + + mul_fp6(t0, a[0], b[0]); + mul_fp6(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} + +static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + vec384x t; + + mul_fp2(t, a[2], b); + mul_fp2(ret[2], a[1], b); + mul_fp2(ret[1], a[0], b); + mul_by_u_plus_1_fp2(ret[0], t); +} + +static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, /*t2,*/ t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2(t3, a[2], b[1]); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... 
*/ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2(ret[2], a[2], b[0]); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec384fp6 t0, t1, t2; + + mul_by_xy0_fp6(t0, a[0], xy00z0); + mul_by_0y0_fp6(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} +#endif + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + add_fp6(t0, a[0], a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, a[1]); + add_fp6(t1, a[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], a[1][2]); + add_fp2(t1[0], a[0][0], t1[2]); + add_fp2(t1[1], a[0][1], a[1][0]); + add_fp2(t1[2], a[0][2], a[1][1]); +#endif + mul_fp6(t0, t0, t1); + mul_fp6(t1, a[0], a[1]); + + /* ret[1] = 2*(a0*a1) */ + add_fp6(ret[1], t1, t1); + + /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v + = a0^2 + a1^2*v */ + sub_fp6(ret[0], t0, t1); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(ret[0], ret[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(ret[0][0], ret[0][0], t1[2]); + sub_fp2(ret[0][1], ret[0][1], t1[0]); + sub_fp2(ret[0][2], ret[0][2], t1[1]); +#endif +} + +static void conjugate_fp12(vec384fp12 a) +{ neg_fp6(a[1], a[1]); } + +static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x c0, c1, c2, t0, t1; + + /* c0 = a0^2 - (a1*a2)*(u+1) */ + sqr_fp2(c0, a[0]); + mul_fp2(t0, a[1], a[2]); + mul_by_u_plus_1_fp2(t0, t0); + sub_fp2(c0, c0, t0); + + /* c1 = a2^2*(u+1) - (a0*a1) */ + sqr_fp2(c1, a[2]); + mul_by_u_plus_1_fp2(c1, c1); + mul_fp2(t0, a[0], a[1]); + sub_fp2(c1, c1, t0); + + /* c2 = a1^2 - a0*a2 */ + sqr_fp2(c2, a[1]); + mul_fp2(t0, a[0], a[2]); + sub_fp2(c2, c2, t0); + + /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ + mul_fp2(t0, c1, a[2]); + mul_fp2(t1, c2, a[1]); + add_fp2(t0, t0, t1); + mul_by_u_plus_1_fp2(t0, t0); + mul_fp2(t1, c0, a[0]); + add_fp2(t0, t0, t1); + + reciprocal_fp2(t1, t0); + + mul_fp2(ret[0], c0, t1); + mul_fp2(ret[1], c1, t1); + mul_fp2(ret[2], c2, t1); +} + +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + sqr_fp6(t0, a[0]); + sqr_fp6(t1, a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(t0, t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(t0[0], t0[0], t1[2]); + sub_fp2(t0[1], t0[1], t1[0]); + sub_fp2(t0[2], t0[2], t1[1]); +#endif + + inverse_fp6(t1, t0); + + mul_fp6(ret[0], a[0], t1); + mul_fp6(ret[1], a[1], t1); + neg_fp6(ret[1], ret[1]); +} + +typedef vec384x vec384fp4[2]; + +#if defined(__FP2x2__) +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec768x t0, t1, t2; + + sqr_fp2x2(t0, a0); + sqr_fp2x2(t1, a1); + add_fp2(ret[1], a0, 
a1); + + mul_by_u_plus_1_fp2x2(t2, t1); + add_fp2x2(t2, t2, t0); + redc_fp2x2(ret[0], t2); + + sqr_fp2x2(t2, ret[1]); + sub_fp2x2(t2, t2, t0); + sub_fp2x2(t2, t2, t1); + redc_fp2x2(ret[1], t2); +} +#else +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec384x t0, t1; + + sqr_fp2(t0, a0); + sqr_fp2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2(ret[0], t1); + add_fp2(ret[0], ret[0], t0); + + sqr_fp2(ret[1], ret[1]); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); +} +#endif + +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp4 t0, t1, t2; + + sqr_fp4(t0, a[0][0], a[1][1]); + sqr_fp4(t1, a[1][0], a[0][2]); + sqr_fp4(t2, a[0][1], a[1][2]); + + sub_fp2(ret[0][0], t0[0], a[0][0]); + add_fp2(ret[0][0], ret[0][0], ret[0][0]); + add_fp2(ret[0][0], ret[0][0], t0[0]); + + sub_fp2(ret[0][1], t1[0], a[0][1]); + add_fp2(ret[0][1], ret[0][1], ret[0][1]); + add_fp2(ret[0][1], ret[0][1], t1[0]); + + sub_fp2(ret[0][2], t2[0], a[0][2]); + add_fp2(ret[0][2], ret[0][2], ret[0][2]); + add_fp2(ret[0][2], ret[0][2], t2[0]); + + mul_by_u_plus_1_fp2(t2[1], t2[1]); + add_fp2(ret[1][0], t2[1], a[1][0]); + add_fp2(ret[1][0], ret[1][0], ret[1][0]); + add_fp2(ret[1][0], ret[1][0], t2[1]); + + add_fp2(ret[1][1], t0[1], a[1][1]); + add_fp2(ret[1][1], ret[1][1], ret[1][1]); + add_fp2(ret[1][1], ret[1][1], t0[1]); + + add_fp2(ret[1][2], t1[1], a[1][2]); + add_fp2(ret[1][2], ret[1][2], ret[1][2]); + add_fp2(ret[1][2], ret[1][2], t1[1]); +} + +/* + * caveat lector! |n| has to be non-zero and not more than 3! + */ +static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) +{ + vec_copy(ret[0], a[0], sizeof(ret[0])); + cneg_fp(ret[1], a[1], n & 1); +} + +static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) +{ + static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ + { { 0 }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, + { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), + TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), + TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, + { { 0 }, { ONE_MONT_P } } + }; + static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ + { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + + frobenius_map_fp2(ret[0], a[0], n); + frobenius_map_fp2(ret[1], a[1], n); + frobenius_map_fp2(ret[2], a[2], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1], ret[1], coeffs1[n]); + mul_fp(ret[2][0], ret[2][0], coeffs2[n]); + mul_fp(ret[2][1], ret[2][1], coeffs2[n]); +} + +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ + { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), + TO_LIMB_T(0x97e83cccd117228f), 
TO_LIMB_T(0xa35baecab2dc29ee), + TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, + { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), + TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), + TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, + { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), + TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), + TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, + { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, + }; + + frobenius_map_fp6(ret[0], a[0], n); + frobenius_map_fp6(ret[1], a[1], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1][0], ret[1][0], coeffs[n]); + mul_fp2(ret[1][1], ret[1][1], coeffs[n]); + mul_fp2(ret[1][2], ret[1][2], coeffs[n]); +} + + +/* + * BLS12-381-specifc Fp12 shortcuts. + */ +void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) +{ sqr_fp12(ret, a); } + +void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) +{ cyclotomic_sqr_fp12(ret, a); } + +void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ mul_fp12(ret, a, b); } + +void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ mul_by_xy00z0_fp12(ret, a, xy00z0); } + +void blst_fp12_conjugate(vec384fp12 a) +{ conjugate_fp12(a); } + +void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) +{ inverse_fp12(ret, a); } + +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) +{ frobenius_map_fp12(ret, a, n); } + +int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) +{ return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } + +int blst_fp12_is_one(const vec384fp12 a) +{ + return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); +} + +const vec384fp12 *blst_fp12_one(void) +{ return (const vec384fp12 *)BLS12_381_Rx.p12; } + +void blst_bendian_from_fp12(unsigned char ret[48*12], const vec384fp12 a) +{ + size_t i, j; + vec384 out; + + for (i = 0; i < 3; i++) { + for (j = 0; j < 2; j++) { + from_fp(out, a[j][i][0]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + from_fp(out, a[j][i][1]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + } + } +} + +size_t blst_fp12_sizeof(void) +{ return sizeof(vec384fp12); } diff --git a/crypto/blst_src/hash_to_field.c b/crypto/blst_src/hash_to_field.c new file mode 100644 index 00000000000..6816ea8b922 --- /dev/null +++ b/crypto/blst_src/hash_to_field.c @@ -0,0 +1,177 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +static const vec384 BLS12_381_RRRR = { /* RR^2 */ + TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), + TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), + TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) +}; + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +static void sha256_init_Zpad(SHA256_CTX *ctx) +{ + ctx->h[0] = 0xda5698beU; + ctx->h[1] = 0x17b9b469U; + ctx->h[2] = 0x62335799U; + ctx->h[3] = 0x779fbecaU; + ctx->h[4] = 0x8ce5d491U; + ctx->h[5] = 0xc0d26243U; + ctx->h[6] = 0xbafef9eaU; + ctx->h[7] = 0x1837a9d8U; + ctx->N = 64; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void vec_xor(void *restrict ret, const void *restrict a, + const void *restrict b, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i] ^ bp[i]; +} + +static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + union { limb_t align; unsigned char c[32]; } b_0; + union { limb_t align; unsigned char c[33+256+31]; } b_i; + unsigned char *p; + size_t i, b_i_bits, b_i_blocks; + SHA256_CTX ctx; + + /* + * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' + */ + if (DST_len > 255) { + sha256_init(&ctx); + sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); + sha256_update(&ctx, DST, DST_len); + sha256_final(b_0.c, &ctx); + DST = b_0.c, DST_len = 32; + } + b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; + vec_zero(b_i.c + b_i_blocks - 64, 64); + + p = b_i.c + 33; + for (i = 0; i < DST_len; i++) + p[i] = DST[i]; + p[i++] = (unsigned char)DST_len; + p[i++] = 0x80; + p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; + b_i_bits = (33 + DST_len + 1) * 8; + p = b_i.c + b_i_blocks; + p[-2] = (unsigned char)(b_i_bits >> 8); + p[-1] = (unsigned char)(b_i_bits); + + sha256_init_Zpad(&ctx); /* Z_pad | */ + sha256_update(&ctx, aug, aug_len); /* | aug | */ + sha256_update(&ctx, msg, msg_len); /* | msg | */ + /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ + b_i.c[30] = (unsigned char)(len_in_bytes >> 8); + b_i.c[31] = (unsigned char)(len_in_bytes); + b_i.c[32] = 0; + sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); + sha256_final(b_0.c, &ctx); + + sha256_init_h(ctx.h); + vec_copy(b_i.c, b_0.c, 32); + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + + len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ + len_in_bytes /= 32; /* caller being responsible for accordingly large + * buffer. hash_to_field passes one with length + * divisible by 64, remember? which works... 
*/ + while (--len_in_bytes) { + sha256_init_h(ctx.h); + vec_xor(b_i.c, b_0.c, bytes, 32); + bytes += 32; + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + } +} +#endif + +/* + * |nelems| is 'count * m' from spec + */ +static void hash_to_field(vec384 elems[], size_t nelems, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ + size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + limb_t *pseudo_random = alloca(len_in_bytes); +#else + limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; +#endif + unsigned char *bytes; + vec768 elem; + + aug_len = aug!=NULL ? aug_len : 0; + DST_len = DST!=NULL ? DST_len : 0; + + expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, + aug, aug_len, msg, msg_len, DST, DST_len); + + vec_zero(elem, sizeof(elem)); + bytes = (unsigned char *)pseudo_random; + while (nelems--) { + limbs_from_be_bytes(elem, bytes, L); + bytes += L; + /* + * L-bytes block % P, output is in Montgomery domain... + */ + redc_mont_384(elems[0], elem, BLS12_381_P, p0); + mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); + elems++; + } +} + +void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); + unsigned char *buf_ptr = bytes; + + if (buf_len > 255*32) + return; + + if (buf_len != len_in_bytes) + buf_ptr = alloca(buf_len); + + expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, + DST, DST_len); + if (buf_ptr != bytes) { + unsigned char *ptr = buf_ptr; + while (len_in_bytes--) + *bytes++ = *ptr++; + vec_zero(buf_ptr, buf_len); + } +} diff --git a/crypto/blst_src/keygen.c b/crypto/blst_src/keygen.c new file mode 100644 index 00000000000..9b62f16b534 --- /dev/null +++ b/crypto/blst_src/keygen.c @@ -0,0 +1,319 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "bytes.h" +#include "sha256.h" + +typedef struct { + SHA256_CTX ctx; + unsigned int h_ipad[8]; + unsigned int h_opad[8]; + union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; +} HMAC_SHA256_CTX; + +static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) +{ + size_t i; + + if (K == NULL) { /* reuse h_ipad and h_opad */ + sha256_hcopy(ctx->ctx.h, ctx->h_ipad); + ctx->ctx.N = 64; + vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); + ctx->ctx.off = 0; + + return; + } + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + if (K_len > 64) { + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, K, K_len); + sha256_final(ctx->tail.c, &ctx->ctx); + } else { + sha256_bcopy(ctx->tail.c, K, K_len); + } + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)0x3636363636363636; + + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, ctx->tail.c, 64); + sha256_hcopy(ctx->h_ipad, ctx->ctx.h); + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); + + sha256_init_h(ctx->h_opad); + sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + ctx->tail.c[32] = 0x80; + ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ + ctx->tail.c[63] = 0; +} + +static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, + size_t len) +{ sha256_update(&ctx->ctx, inp, len); } + +static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) +{ + sha256_final(ctx->tail.c, &ctx->ctx); + sha256_hcopy(ctx->ctx.h, ctx->h_opad); + sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); + sha256_emit(md, ctx->ctx.h); +} + +static void HKDF_Extract(unsigned char PRK[32], + const void *salt, size_t salt_len, + const void *IKM, size_t IKM_len, +#ifndef __BLST_HKDF_TESTMODE__ + int IKM_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ + unsigned char zero[1] = { 0 }; + + HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); + HMAC_update(ctx, IKM, IKM_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (IKM_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + HMAC_update(ctx, zero, 1); + } +#endif + HMAC_final(PRK, ctx); +} + +static void HKDF_Expand(unsigned char *OKM, size_t L, + const unsigned char PRK[32], + const void *info, size_t info_len, +#ifndef __BLST_HKDF_TESTMODE__ + int info_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + unsigned char *info_prime = alloca(info_len + 2 + 1); +#else + unsigned char info_prime[info_len + 2 + 1]; +#endif + + HMAC_init(ctx, PRK, 32); + + if (info_len != 0) + sha256_bcopy(info_prime, info, info_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (info_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + info_prime[info_len + 0] = (unsigned char)(L >> 8); + info_prime[info_len + 1] = (unsigned char)(L); + info_len += 2; + } +#endif + info_prime[info_len] = 1; /* counter */ + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + while (L > 32) { + sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); + OKM += 32; L -= 32; + ++info_prime[info_len]; /* counter */ + HMAC_init(ctx, NULL, 0); + HMAC_update(ctx, ctx->tail.c, 32); + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + } + sha256_bcopy(OKM, ctx->tail.c, L); +} + +#ifndef __BLST_HKDF_TESTMODE__ +static void keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len, + int version) +{ + struct { + HMAC_SHA256_CTX ctx; + unsigned char PRK[32], OKM[48]; + vec512 key; + } scratch; + unsigned char salt_prime[32] = "BLS-SIG-KEYGEN-SALT-"; + + if (IKM_len < 32 || (version > 4 && salt == NULL)) { + vec_zero(SK, sizeof(pow256)); + return; + } + + /* + * Vet |info| since some callers were caught to be sloppy, e.g. + * SWIG-4.0-generated Python wrapper... + */ + info_len = info==NULL ? 0 : info_len; + + if (salt == NULL) { + salt = salt_prime; + salt_len = 20; + } + + if (version == 4) { + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + while (1) { + /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ + HKDF_Extract(scratch.PRK, salt, salt_len, + IKM, IKM_len, 1, &scratch.ctx); + + /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ + HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, + info, info_len, 1, &scratch.ctx); + + /* SK = OS2IP(OKM) mod r */ + vec_zero(scratch.key, sizeof(scratch.key)); + limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); + redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); + /* + * Given that mul_mont_sparse_256 has special boundary conditions + * it's appropriate to mention that redc_mont_256 output is fully + * reduced at this point. Because we started with 384-bit input, + * one with most significant half smaller than the modulus. 
+ */ + mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, + BLS12_381_r, r0); + + if (version < 4 || !vec_is_zero(scratch.key, sizeof(vec256))) + break; + + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 4); } + +void blst_keygen_v3(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 3); } + +void blst_keygen_v4_5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 4); } + +void blst_keygen_v5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 5); } + +/* + * https://eips.ethereum.org/EIPS/eip-2333 + */ +void blst_derive_master_eip2333(pow256 SK, const void *seed, size_t seed_len) +{ keygen(SK, seed, seed_len, NULL, 0, NULL, 0, 4); } + +static void parent_SK_to_lamport_PK(pow256 PK, const pow256 parent_SK, + unsigned int index) +{ + size_t i; + struct { + HMAC_SHA256_CTX ctx; + SHA256_CTX ret; + unsigned char PRK[32], IKM[32]; + unsigned char lamport[255][32]; + } scratch; + + /* salt = I2OSP(index, 4) */ + unsigned char salt[4] = { (unsigned char)(index>>24), + (unsigned char)(index>>16), + (unsigned char)(index>>8), + (unsigned char)(index) }; + + /* IKM = I2OSP(parent_SK, 32) */ + for (i = 0; i < 32; i++) + scratch.IKM[i] = parent_SK[31-i]; + + /* lamport_0 = IKM_to_lamport_SK(IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; /* 32*8 in big endian */ + scratch.ctx.ctx.buf[63] = 0; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_0[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_init(&scratch.ret); + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + + /* not_IKM = flip_bits(IKM) */ + for (i = 0; i< 32; i++) + scratch.IKM[i] = ~scratch.IKM[i]; + + /* lamport_1 = IKM_to_lamport_SK(not_IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_1[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + 
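+        /* the block buffer was pre-padded above (0x80 terminator plus a
+         * 256-bit message length), so each 32-byte lamport_1[i] is hashed
+         * with a single compression-function call; only the message bytes
+         * change per iteration. */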
sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + sha256_final(PK, &scratch.ret); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_derive_child_eip2333(pow256 SK, const pow256 parent_SK, + unsigned int child_index) +{ + parent_SK_to_lamport_PK(SK, parent_SK, child_index); + keygen(SK, SK, sizeof(pow256), NULL, 0, NULL, 0, 4); +} +#endif diff --git a/crypto/blst_src/map_to_g1.c b/crypto/blst_src/map_to_g1.c new file mode 100644 index 00000000000..6613d68bb29 --- /dev/null +++ b/crypto/blst_src/map_to_g1.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384 Aprime_E1 = { + /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 + d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ + TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), + TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), + TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) +}; +static const vec384 Bprime_E1 = { + /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 + a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ + TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), + TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), + TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) +}; + +static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], + const vec384 Zz_powers[], size_t n) +{ + while (n--) + mul_fp(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) +{ + while (n--) { + mul_fp(acc, acc, x); + add_fp(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + + * ... + k_(1,0) + * ... 
+ */ + static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), + TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), + TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, + { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), + TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), + TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, + { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), + TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), + TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, + { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), + TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), + TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, + { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), + TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), + TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, + { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), + TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), + TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, + { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), + TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), + TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, + { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), + TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), + TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, + { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), + TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), + TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, + { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), + TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), + TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, + { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), + TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), + TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, + { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), + TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), + TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } + }; + /* ... + * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... 
+ k_(2,0) + */ + static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), + TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), + TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, + { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), + TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), + TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, + { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), + TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), + TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, + { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), + TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), + TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, + { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), + TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), + TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, + { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), + TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), + TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, + { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), + TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), + TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, + { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), + TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), + TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, + { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), + TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), + TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, + { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), + TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), + TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + + * ... + k_(3,0) + * ... 
+ */ + static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), + TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), + TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, + { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), + TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), + TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, + { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), + TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), + TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, + { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), + TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), + TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, + { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), + TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), + TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, + { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), + TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), + TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, + { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), + TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), + TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, + { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), + TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), + TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, + { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), + TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), + TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, + { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), + TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), + TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, + { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), + TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), + TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, + { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), + TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), + TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, + { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), + TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), + TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, + { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), + TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), + TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, + { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), + TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), + TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, + { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), + TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), + TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } + }; + /* ... + * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) + */ + static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), + TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), + TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, + { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), + TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), + TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, + { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), + TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), + TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, + { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), + TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), + TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, + { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), + TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), + TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, + { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), + TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), + TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, + { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), + TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), + TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, + { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), + TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), + TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, + { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), + TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), + TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, + { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), + TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), + TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, + { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), + TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), + TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, + { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), + TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), + TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, + { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), + TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), + TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, + { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), + TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), + TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, + { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), + TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), + TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } + }; + vec384 Zz_powers[15], map[15], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ +#ifdef __OPTIMIZE_SIZE__ + for (size_t i = 14; i > 0; i--) + mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); +#else + sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ + mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ + sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ + mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ + 
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ + mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ + sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ + mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ + sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ + mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ + sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ + mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ + sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ + mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ +#endif + + map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); + mul_fp(xn, p->X, isogeny_map_x_num[11]); + add_fp(xn, xn, map[10]); + map_fp(xn, p->X, map, 10); + + map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); + add_fp(xd, p->X, map[9]); + map_fp(xd, p->X, map, 9); + mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ + + map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); + mul_fp(yn, p->X, isogeny_map_y_num[15]); + add_fp(yn, yn, map[14]); + map_fp(yn, p->X, map, 14); + mul_fp(yn, yn, p->Y); /* yn *= Y */ + + map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); + add_fp(yd, p->X, map[14]); + map_fp(yd, p->X, map, 14); + mul_fp(Zz_powers[14], Zz_powers[14], p->Z); + mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp(out->X, xn, yd); + mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp(out->Y, out->Z); + mul_fp(out->Y, out->Y, xd); + mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) +{ + static const vec384 minus_A = { /* P - A */ + TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), + TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), + TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) + }; + static const vec384 Z = { /* (11<<384) % P */ + TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), + TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), + TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) + }; + static const vec384 sqrt_minus_ZZZ = { + TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), + TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), + TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) + }; + static const vec384 ZxA = { + TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), + TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), + TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) + }; + vec384 uu, tv2, x2n, gx1, gxd, y2; +#if 0 + vec384 xn, x1n, xd, y, y1, Zuu, tv4; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +# define tv4 y1 +#endif +#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + */ + /* x numerator variants */ + sqr_fp(uu, u); /* uu = u^2 */ + mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ + mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ + mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); 
/* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp(tv2, xd); /* tv2 = xd^2 */ + mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ + sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ + mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ + mul_fp(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp(u); + e2 = sgn0_fp(y); + cneg_fp(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp(p->X, xn, xd); /* X = xn * xd */ + mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp +} + +static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) +{ + POINTonE1_dadd(out, out, p, NULL); + while(n--) + POINTonE1_double(out, out); +} + +static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) +{ + POINTonE1_double(out, in); /* 1: 0x2 */ + POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ + POINTonE1 p; + + map_to_isogenous_E1(&p, u); + + if (v != NULL) { + map_to_isogenous_E1(out, v); /* borrow |out| */ + POINTonE1_dadd(&p, &p, out, Aprime_E1); + } + + isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ + + /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ + POINTonE1_times_minus_z(out, &p); + POINTonE1_dadd(out, out, &p, NULL); +} + +void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ map_to_g1(out, u, v); } + +static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[1]; + + hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], NULL); +} + +void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[2]; + + hash_to_field(u, 2, aug, 
aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], u[1]); +} + +void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void sigma(POINTonE1 *out, const POINTonE1 *in); + +#if 0 +#ifdef __OPTIMIZE_SIZE__ +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + static const byte zz_minus_1_div_by_3[] = { + TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) + }; + size_t n = 126-1; + const POINTonE1 *dblin = in; + + while(n--) { + POINTonE1_double(out, dblin); dblin = out; + if (is_bit_set(zz_minus_1_div_by_3, n)) + POINTonE1_dadd(out, out, in, NULL); + } +} +#else +static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) +{ + while(n--) + POINTonE1_double(out, out); + POINTonE1_dadd(out, out, p, NULL); +} + +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + POINTonE1 t3, t5, t7, t11, t85; + + POINTonE1_double(&t7, in); /* 2P */ + POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ + POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ + POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ + POINTonE1_double(&t85, &t5); /* 10P */ + POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ + POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ + /* (-0xd201000000010000^2 - 1) / 3 */ + POINTonE1_double(out, &t7); /* 0xe */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ + POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ + POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ + POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ + POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ + POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ + POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ + POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ +} +#endif + +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + sigma(&t0, P); /* σ(P) */ + sigma(&t1, &t0); /* σ²(P) */ + + POINTonE1_double(&t0, &t0); /* 2σ(P) */ + POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ + POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ + POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ + POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); + POINTonE1_cneg(&t1, 1); + POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ + /* - σ²(P) */ + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} +#else +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + POINTonE1_times_minus_z(&t0, P); + POINTonE1_times_minus_z(&t1, &t0); + POINTonE1_cneg(&t1, 1); /* [-z²]P */ + + sigma(&t0, P); /* σ(P) */ + sigma(&t0, &t0); /* σ²(P) */ + + return POINTonE1_is_equal(&t0, &t1); +} +#endif + +int blst_p1_in_g1(const POINTonE1 *p) +{ return (int)POINTonE1_in_G1(p); } + +int blst_p1_affine_in_g1(const POINTonE1_affine *p) +{ + POINTonE1 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + 
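+    /* an all-zero affine input (the point-at-infinity encoding) ends up with
+     * Z = 0, any other input with Z = 1 in Montgomery form, so the projective
+     * subgroup check below covers both cases. */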
return (int)POINTonE1_in_G1(&P); +} diff --git a/crypto/blst_src/map_to_g2.c b/crypto/blst_src/map_to_g2.c new file mode 100644 index 00000000000..90fd86e9d31 --- /dev/null +++ b/crypto/blst_src/map_to_g2.c @@ -0,0 +1,444 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384x Aprime_E2 = { /* 240*i */ + { 0 }, + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } +}; +static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } +}; + +static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], + const vec384x Zz_powers[], size_t n) +{ + while (n--) + mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) +{ + while (n--) { + mul_fp2(acc, acc, x); + add_fp2(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) + * ... + */ + static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, + { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, + {{ 0 }, + { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), + TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), + TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, + {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), + TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), + TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, + { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), + TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), + TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, + {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), + TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), + TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, + { 0 }} + }; + /* ... 
+ * x_den = x'^2 + k_(2,1) * x' + k_(2,0) + */ + static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + {{ 0 }, + { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), + TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), + TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, + {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), + TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), + TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, + { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), + TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), + TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) + * ... + */ + static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, + { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, + {{ 0 }, + { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), + TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), + TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, + {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), + TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), + TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, + { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), + TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), + TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, + {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), + TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), + TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, + { 0 }} + }; + /* ... 
+ * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) + */ + static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, + { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, + {{ 0 }, + { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), + TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), + TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, + {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), + TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), + TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, + { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), + TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), + TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} + }; + vec384x Zz_powers[3], map[3], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ + sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ + mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ + + map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); + mul_fp2(xn, p->X, isogeny_map_x_num[3]); + add_fp2(xn, xn, map[2]); + map_fp2(xn, p->X, map, 2); + + map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); + add_fp2(xd, p->X, map[1]); + map_fp2(xd, p->X, map, 1); + mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ + + map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); + mul_fp2(yn, p->X, isogeny_map_y_num[3]); + add_fp2(yn, yn, map[2]); + map_fp2(yn, p->X, map, 2); + mul_fp2(yn, yn, p->Y); /* yn *= Y */ + + map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); + add_fp2(yd, p->X, map[2]); + map_fp2(yd, p->X, map, 2); + mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); + mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) +{ + static const vec384x minus_A = { + { 0 }, + { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), + TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), + TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } + }; + static const vec384x Z = { /* -2 - i */ + { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), + TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), + TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + static const vec384x recip_ZZZ = { /* 1/(Z^3) */ + { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), + TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), + TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, + { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), + TO_LIMB_T(0x67e495a909e7a18e), 
TO_LIMB_T(0xdf2da23b8145b8f7), + TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } + }; + static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ + /* a^2 + b^2 */ + { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), + TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), + TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, + /* (a^2 + b^2)^((P-3)/4) */ + { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), + TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), + TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } + }; + static const vec384x ZxA = { /* 240 - 480*i */ + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, + { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), + TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), + TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } + }; + vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; +#if 0 + vec384x xn, x1n, xd, y, y1, Zuu; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +#endif +#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + * with 9mod16 twists... + */ + /* x numerator variants */ + sqr_fp2(uu, u); /* uu = u^2 */ + mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ + mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ + mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp2(tv2, xd); /* tv2 = xd^2 */ + mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ + sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp2(y1, tv4, /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ + recip_ZZZ, magic_ZZZ); + mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp2(y2, y1, uu); /* y2 = y1 * uu */ + mul_fp2(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? 
y1 : y2 */ + + e1 = sgn0_fp2(u); + e2 = sgn0_fp2(y); + cneg_fp2(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp2(p->X, xn, xd); /* X = xn * xd */ + mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp2 +} + +#if 0 +static const byte h_eff[] = { + TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4), + TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a), + TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95), + TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768), + TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3) +}; + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ POINTonE2_mult_w5(out, p, h_eff, 636); } +#else +/* + * As per suggestions in "7. Clearing the cofactor" at + * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 + */ +static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) +{ + POINTonE2_dadd(out, out, p, NULL); + while(n--) + POINTonE2_double(out, out); +} + +static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) +{ + POINTonE2_double(out, in); /* 1: 0x2 */ + POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +static void psi(POINTonE2 *out, const POINTonE2 *in); + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ + POINTonE2 t0, t1; + + /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ + POINTonE2_double(out, p); /* out = 2P */ + psi(out, out); /* out = Ψ(2P) */ + psi(out, out); /* out = Ψ²(2P) */ + + vec_copy(&t0, p, sizeof(t0)); + POINTonE2_cneg(&t0, 1); /* t0 = -P */ + psi(&t1, &t0); /* t1 = -Ψ(P) */ + POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ + + POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ + POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ + POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ + POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ + /* + [z - 1]Ψ(P) */ + /* + Ψ²(2P) */ +} +#endif + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ + POINTonE2 p; + + map_to_isogenous_E2(&p, u); + + if (v != NULL) { + map_to_isogenous_E2(out, v); /* borrow |out| */ + POINTonE2_dadd(&p, &p, out, Aprime_E2); + } + + isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ + clear_cofactor(out, &p); +} + +void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ map_to_g2(out, u, v); } + +static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[1]; + + hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], NULL); +} + +void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t 
aug_len) +{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[2]; + + hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], u[1]); +} + +void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static bool_t POINTonE2_in_G2(const POINTonE2 *P) +{ +#if 0 + POINTonE2 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + psi(&t0, P); /* Ψ(P) */ + psi(&t0, &t0); /* Ψ²(P) */ + psi(&t1, &t0); /* Ψ³(P) */ + + POINTonE2_times_minus_z(&t2, &t1); + POINTonE2_dadd(&t0, &t0, &t2, NULL); + POINTonE2_cneg(&t0, 1); + POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */ + + return vec_is_zero(t0.Z, sizeof(t0.Z)); +#else + POINTonE2 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + psi(&t0, P); /* Ψ(P) */ + + POINTonE2_times_minus_z(&t1, P); + POINTonE2_cneg(&t1, 1); /* [z]P */ + + return POINTonE2_is_equal(&t0, &t1); +#endif +} + +int blst_p2_in_g2(const POINTonE2 *p) +{ return (int)POINTonE2_in_G2(p); } + +int blst_p2_affine_in_g2(const POINTonE2_affine *p) +{ + POINTonE2 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + return (int)POINTonE2_in_G2(&P); +} diff --git a/crypto/blst_src/multi_scalar.c b/crypto/blst_src/multi_scalar.c new file mode 100644 index 00000000000..d0b3deefe25 --- /dev/null +++ b/crypto/blst_src/multi_scalar.c @@ -0,0 +1,414 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * Infinite point among inputs would be devastating. Shall we change it? + */ +#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \ +static void ptype##s_to_affine(ptype##_affine dst[], \ + const ptype *const points[], size_t npoints) \ +{ \ + size_t i; \ + vec##bits *acc, ZZ, ZZZ; \ + const ptype *point = NULL; \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \ +\ + while (npoints) { \ + const ptype *p, *const *walkback; \ + size_t delta = strideZ, sizeof(vec##bits)); \ + for (i = 1; i < delta; i++, acc++) \ + point = *points ? *points++ : point+1, \ + mul_##field(acc[0], acc[-1], point->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + walkback = points-1, p = point, --delta, dst += delta; \ + for (i = 0; i < delta; i++, acc--, dst--) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], p->Z, acc[0]); \ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + p = (p == *walkback) ? 
*--walkback : p-1; \ + } \ + sqr_##field(ZZ, acc[0]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[0]); /* 1/Z^3 */\ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + ++delta, dst += delta, npoints -= delta; \ + } \ +} \ +\ +void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \ + size_t npoints) \ +{ ptype##s_to_affine(dst, points, npoints); } + +POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp) +POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2) + +/* + * This is two-step multi-scalar multiplication procedure. First, given + * a set of points you pre-compute a table for chosen windowing factor + * [expressed in bits with value between 2 and 14], and then you pass + * this table to the actual multiplication procedure along with scalars. + * Idea is that the pre-computed table will be reused multiple times. In + * which case multiplication runs faster than below Pippenger algorithm + * implementation for up to ~16K points for wbits=8, naturally at the + * expense of multi-megabyte table. One can trade even more memory for + * performance, but each wbits increment doubles the memory requirement, + * so at some point it gets prohibively large... For reference, without + * reusing the table it's faster than Pippenger algorithm for up ~32 + * points [with wbits=5]... + */ + +#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096) + +#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \ + const ptype##_affine *point) \ +{ \ + size_t i, j, n = (size_t)1 << (wbits-1); \ + /* row[-1] is implicit infinity */\ + vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ + vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ + ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ + for (i = 2, j = 1; i < n; i += 2, j++) \ + ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ + ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ +} /* row[4] ... */\ +\ +static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ + size_t wbits, size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t i, j; \ + vec##bits *acc, ZZ, ZZZ; \ +\ + src += total; \ + acc = (vec##bits *)src; \ + vec_copy(acc++, one, sizeof(vec##bits)); \ + for (i = 0; i < npoints; i++) \ + for (j = nwin; --src, --j; acc++) \ + mul_##field(acc[0], acc[-1], src->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + for (i = 0; i < npoints; i++) { \ + vec_copy(dst++, src++, sizeof(ptype##_affine)); \ + for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], src->Z, acc[0]); \ + mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ + } \ + } \ +} \ +\ +/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ +static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t nmin = wbits>9 ? 
(size_t)1: (size_t)1 << (9-wbits); \ + size_t i, top = 0; \ + ptype *rows, *row; \ + const ptype##_affine *point = NULL; \ + size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ + if (stride == 0) stride = 1; \ +\ + while (npoints >= nmin) { \ + size_t limit = total - npoints; \ +\ + if (top + (stride << wbits) > limit) { \ + stride = (limit - top) >> wbits; \ + if (stride == 0) break; \ + } \ + rows = row = (ptype *)(&table[top]); \ + for (i = 0; i < stride; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ + top += stride << (wbits-1); \ + npoints -= stride; \ + } \ + rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \ + for (i = 0; i < npoints; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ +} \ +\ +size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ +{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ +void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ ptype##s_precompute_wbits(table, wbits, points, npoints); } + +#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ + size_t wbits, limb_t booth_idx) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ + bool_t idx_is_zero; \ + static const ptype##_affine infinity = { 0 }; \ +\ + booth_idx &= ((limb_t)1 << wbits) - 1; \ + idx_is_zero = is_zero(booth_idx); \ + booth_idx -= 1 ^ idx_is_zero; \ + vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ \ + limb_t wmask, wval; \ + size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ + const byte *scalar, *const *scalar_s = scalars; \ + const ptype##_affine *row = table; \ +\ + size_t scratch_sz = SCRATCH_SZ(ptype); \ + if (scratch == NULL) { \ + scratch_sz /= 4; /* limit to 288K */ \ + scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ + scratch = alloca(sizeof(ptype) * scratch_sz); \ + } \ +\ + nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ + scalar = *scalar_s++; \ +\ + /* top excess bits modulo target window size */ \ + window = nbits % wbits; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + nbits -= window; \ + z = is_zero(nbits); \ + wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ + row += nwin; \ +\ + i = 1; vec_zero(ret, sizeof(*ret)); \ + while (nbits > 0) { \ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? 
*scalar_s++ : scalar+nbytes; \ + wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +\ + for (j = 0; j < wbits; j++) \ + ptype##_double(ret, ret); \ +\ + window = wbits; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + nbits -= window; \ + i = 0; row = table; scalar_s = scalars; \ + } \ +\ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +} \ +\ +size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ +{ \ + const size_t scratch_sz = SCRATCH_SZ(ptype); \ + return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \ +} \ +void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } + +PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) + +PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) + +/* + * Pippenger algorithm implementation, fastest option for larger amount + * of points... + */ + +static size_t pippenger_window_size(size_t npoints) +{ + size_t wbits; + + for (wbits=0; npoints>>=1; wbits++) ; + + return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1)); +} + +#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ +typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; + +#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ +static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ + size_t wbits) \ +{ \ + ptype##xyzz ret[1], acc[1]; \ + size_t n = (size_t)1 << wbits; \ +\ + /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. 
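+     * |acc| keeps a running suffix sum of the buckets and is added to \
+     * |ret| once per step, which gives bucket x[i-1] its weight i. \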
*/\ + vec_copy(acc, &buckets[--n], sizeof(acc)); \ + vec_copy(ret, &buckets[n], sizeof(ret)); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + while (n--) { \ + ptype##xyzz_dadd(acc, acc, &buckets[n]); \ + ptype##xyzz_dadd(ret, ret, acc); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + } \ + ptype##xyzz_to_Jacobian(out, ret); \ +} \ +\ +static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ + size_t wbits, const ptype##_affine *p) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ +\ + booth_idx &= (1< nbits) wbits = nbits - bit0, cbits = wbits + 1; \ + else wbits = cbits = window; \ + ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ + bit0, wbits, cbits); \ +} \ +void prefix##s_mult_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz scratch[]) \ +{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); } + +DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) + +DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) diff --git a/crypto/blst_src/no_asm.h b/crypto/blst_src/no_asm.h new file mode 100644 index 00000000000..be7bf47e197 --- /dev/null +++ b/crypto/blst_src/no_asm.h @@ -0,0 +1,1345 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#if LIMB_T_BITS==32 +typedef unsigned long long llimb_t; +#endif + +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 || defined(__STDC_NO_VLA__) +# error "unsupported compiler" +#endif + +#if defined(__clang__) +# pragma GCC diagnostic ignored "-Wstatic-in-inline" +#endif + +#if !defined(__clang__) && !defined(__builtin_assume) +# if defined(__GNUC__) && __GNUC__>=5 +# define __builtin_assume(condition) if (!(condition)) __builtin_unreachable() +# elif defined(_MSC_VER) +# define __builtin_assume(condition) __assume(condition) +# else +# define __builtin_assume(condition) (void)(condition) +# endif +#endif + +static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n+1], carry; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (carry=0, j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + limbx = tmp[i] + (hi + (llimb_t)carry); + tmp[i-1] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + limbx = hi + (llimb_t)carry; + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for (carry=0, i=0; i> 
LIMB_T_BITS); + } +} + +#define SUB_MOD_IMPL(bits) \ +inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p) \ +{ sub_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_MOD_IMPL(256) +SUB_MOD_IMPL(384) + +static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n], two_a[n]; + size_t i; + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; + mask = (limb_t)0 - flag; + + for(i=0; i> LIMB_T_BITS) & 1; + } + + return borrow & (is_zero(acc) ^ 1); +} + +#define CHECK_MOD_IMPL(bits) \ +inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ +{ return check_mod_n(a, p, NLIMBS(bits)); } + +CHECK_MOD_IMPL(256) + +static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + add_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define ADD_N_CHECK_MOD_IMPL(bits) \ +inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +ADD_N_CHECK_MOD_IMPL(256) + +static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + sub_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define SUB_N_CHECK_MOD_IMPL(bits) \ +inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_N_CHECK_MOD_IMPL(256) + +static void from_mont_n(limb_t ret[], const limb_t a[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n]; + size_t i, j; + + for (j=0; j> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + a = tmp; + } + + /* this is needed only if input can be non-fully-reduced */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + b = tmp; + } + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (next=ret[0], i=0; i> 1; + next = ret[i+1]; + ret[i] = limb | next << (LIMB_T_BITS-1); + } + ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); + + a = ret; + } +} + +#define RSHIFT_MOD_IMPL(bits) \ +inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ + const vec##bits p) \ +{ 
rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } + +RSHIFT_MOD_IMPL(256) +RSHIFT_MOD_IMPL(384) + +#define DIV_BY_2_MOD_IMPL(bits) \ +inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } + +DIV_BY_2_MOD_IMPL(384) + +static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry, borrow, ret, tmp[n]; + size_t i; + + ret = a[0] & 1; /* parity */ + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + ret |= ((carry - borrow) & 2) ^ 2; + + return ret; +} + +inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) +{ return sgn0_pty_mod_n(a, p, NLIMBS(384)); } + +inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) +{ + vec384 tmp; + + from_mont_n(tmp, a, p, n0, NLIMBS(384)); + + return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); +} + +inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) +{ + limb_t re, im, sign, prty; + + re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); + im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); + + /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ + sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); + sign = (re & sign) | (im & ~sign); + + /* a->re==0 ? prty(a->im) : prty(a->re) */ + prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); + prty = (im & prty) | (re & ~prty); + + return (sign & 2) | (prty & 1); +} + +inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) +{ + vec384x tmp; + + from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); + from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); + + return sgn0_pty_mod_384x(tmp, p); +} + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0) +{ + vec384 aa, bb, cc; + + add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); + add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); + mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); + mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); + mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); + sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); + sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); + sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); +} + +/* + * mul_mont_n without final conditional subtraction, which implies + * that modulus is one bit short, which in turn means that there are + * no carries to handle between iterations... 
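+ * sqr_n_mul_mont_383() and sqr_mont_382x() below rely on this property
+ * for their repeated squarings.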
+ */ +static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mx, hi, tmp[n+1]; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = tmp[i] + hi; + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + } + + vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); +} + +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b) +{ + __builtin_assume(count != 0); + while(count--) { + mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); + a = ret; + } + mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); +} + +void sqr_mont_382x(vec384x ret, const vec384x a, + const vec384 p, limb_t n0) +{ + llimb_t limbx; + limb_t mask, carry, borrow; + size_t i; + vec384 t0, t1; + + /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + + /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + + /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); + + /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); + + /* account for t1's sign... */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#if defined(__GNUC__) || defined(__clang__) +# define MSB(x) ({ limb_t ret = (x) >> (LIMB_T_BITS-1); launder(ret); ret; }) +#else +# define MSB(x) ((x) >> (LIMB_T_BITS-1)) +#endif + +static size_t num_bits(limb_t l) +{ + limb_t x, mask; + size_t bits = is_zero(l) ^ 1; + + if (sizeof(limb_t) == 8) { + x = l >> (32 & (8*sizeof(limb_t)-1)); + mask = 0 - MSB(0 - x); + bits += 32 & mask; + l ^= (x ^ l) & mask; + } + + x = l >> 16; + mask = 0 - MSB(0 - x); + bits += 16 & mask; + l ^= (x ^ l) & mask; + + x = l >> 8; + mask = 0 - MSB(0 - x); + bits += 8 & mask; + l ^= (x ^ l) & mask; + + x = l >> 4; + mask = 0 - MSB(0 - x); + bits += 4 & mask; + l ^= (x ^ l) & mask; + + x = l >> 2; + mask = 0 - MSB(0 - x); + bits += 2 & mask; + l ^= (x ^ l) & mask; + + bits += l >> 1; + + return bits; +} + +#if defined(__clang_major__) && __clang_major__>7 +__attribute__((optnone)) +#endif +static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) +{ + size_t r = LIMB_T_BITS - l; + limb_t mask = 0 - (is_zero(l)^1); + return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); +} + +/* + * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. 
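+ * The two-limb approximation keeps the exact least significant limb and
+ * the most significant bits of |a| and |b|, which is all the constant-time
+ * inner loop consumes per outer iteration.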
+ */ +static void ab_approximation_n(limb_t a_[2], const limb_t a[], + limb_t b_[2], const limb_t b[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a_hi, a_lo, b_hi, b_lo, mask; + size_t i; + + i = n-1; + a_hi = a[i], a_lo = a[i-1]; + b_hi = b[i], b_lo = b[i-1]; + for (i--; --i;) { + mask = 0 - is_zero(a_hi | b_hi); + a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; + b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; + a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; + b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; + } + i = LIMB_T_BITS - num_bits(a_hi | b_hi); + /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] were zeros */ + + a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); + b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); +} + +typedef struct { limb_t f0, g0, f1, g1; } factors; + +static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; + limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; + + a_lo = a_[0], a_hi = a_[1]; + b_lo = b_[0], b_hi = b_[1]; + + while(n--) { + odd = 0 - (a_lo&1); + + /* a_ -= b_ if a_ is odd */ + t_lo = a_lo, t_hi = a_hi; + limbx = a_lo - (llimb_t)(b_lo & odd); + a_lo = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; +} + +static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx = 0; + limb_t carry; + size_t i; + + for (carry=neg&1, i=0; i> LIMB_T_BITS); + } + + return 0 - MSB((limb_t)limbx); +} + +static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry; + size_t i; + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + return carry; +} + +static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t hi; + size_t i; + + for (hi=0, i=0; i> LIMB_T_BITS); + } + + return hi; +} + +static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, + const limb_t b[], limb_t *g_, + size_t n) +{ + __builtin_assume(n != 0); + limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; + size_t i; + + /* |a|*|f_| */ + f = *f_; + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + (void)cneg_n(a_, a, neg, n); + hi = umul_n(a_, a_, f, n); + a_[n] = hi - (f & neg); + + /* |b|*|g_| */ + g = *g_; + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + (void)cneg_n(b_, b, neg, n); + hi = umul_n(b_, b_, g, n); + b_[n] = hi - (g & neg); + + /* |a|*|f_| + |b|*|g_| */ + (void)add_n(a_, a_, b_, n+1); + + /* (|a|*|f_| + |b|*|g_|) >> k */ + for (carry=a_[0], i=0; i> (LIMB_T_BITS-2); + carry = a_[i+1]; + 
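+        /* limb i of (|a|*|f_| + |b|*|g_|) >> (LIMB_T_BITS-2) */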
ret[i] = hi | (carry << 2); + } + + /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ + neg = 0 - MSB(carry); + *f_ = (*f_ ^ neg) - neg; + *g_ = (*g_ ^ neg) - neg; + (void)cneg_n(ret, ret, neg, n); + + return neg; +} + +static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, + const limb_t v[], limb_t g, size_t n) +{ + __builtin_assume(n != 0); + limb_t u_[n], v_[n], neg, hi; + + /* |u|*|f_| */ + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + neg = cneg_n(u_, u, neg, n); + hi = umul_n(u_, u_, f, n) - (f&neg); + + /* |v|*|g_| */ + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + neg = cneg_n(v_, v, neg, n); + hi += umul_n(v_, v_, g, n) - (g&neg); + + /* |u|*|f_| + |v|*|g_| */ + hi += add_n(ret, u_, v_, n); + + return hi; +} + +static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], + const limb_t mod[], const limb_t modx[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; + limb_t a_[2], b_[2], sign, carry, top; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + vec_zero(u, sizeof(u)); u[0] = 1; + vec_zero(v, sizeof(v)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); + (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + smul_2n(t, u, fg.f0, v, fg.g0, 2*n); + smul_2n(v, u, fg.f1, v, fg.g1, 2*n); + vec_copy(u, t, sizeof(u)); + } + + inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); + + sign = 0 - MSB(top); /* top is 1, 0 or -1 */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + top += carry; + sign = 0 - top; /* top is 1, 0 or -1 */ + top |= sign; + for (i=0; i> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + L += ((t_lo & b_lo) >> 1) & borrow; + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + + L += (b_lo + 2) >> 2; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; + + return L; +} + +static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a[n], b[n], t[n]; + limb_t a_[2], b_[2], neg, L = 0; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2); + neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + L += (b[0] >> 1) & neg; + } + + L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + + return (L & 1) ^ 1; +} + +#define 
CT_IS_SQR_MOD_IMPL(bits) \ +inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \ + const vec##bits mod) \ +{ return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits)); } + +CT_IS_SQR_MOD_IMPL(384) + +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi) +{ + llimb_t Rx; + limb_t r_lo = div_top[0], r_hi = div_top[1]; + limb_t Q = 0, mask, borrow, rx; + size_t i; + + for (i = 0; i < LIMB_T_BITS; i++) { + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS); + + /* "if (R >= D) R -= D" */ + r_lo = ((r_lo ^ rx) & borrow) ^ rx; + rx = (limb_t)Rx; + r_hi = ((r_hi ^ rx) & borrow) ^ rx; + + Q <<= 1; + Q |= ~borrow & 1; + + /* "D >>= 1" */ + d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1); + d_hi >>= 1; + } + + mask = 0 - MSB(Q); /* does it overflow? */ + + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + + Q <<= 1; + Q |= borrow ^ 1; + + return (Q | mask); +} + +static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, + limb_t quotient, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t tmp[n+1], carry, mask, borrow; + size_t i; + + /* divisor*quotient */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + tmp[i] = carry; + + /* remainder = dividend - divisor*quotient */ + for (borrow=0, i=0; i<=n; i++) { + limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); + tmp[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + /* if quotient was off by one, add divisor to the remainder */ + for (carry=0, i=0; i> LIMB_T_BITS) & 1; + } + + return (div_rem[i] = quotient + mask); +} + +inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } + +inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } + +/* + * Unlock reference implementations in vect.c + */ +#define mul_by_8_mod_384 mul_by_8_mod_384 +#define mul_by_8_mod_384x mul_by_8_mod_384x +#define mul_by_3_mod_384x mul_by_3_mod_384x +#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x +#define add_mod_384x add_mod_384x +#define sub_mod_384x sub_mod_384x +#define lshift_mod_384x lshift_mod_384x +#define sqr_mont_384x sqr_mont_384x + +inline void vec_prefetch(const void *ptr, size_t len) +{ (void)ptr; (void)len; } + +/* + * SHA-256 + */ 
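+/*
+ * Illustrative sketch only, not part of the upstream blst sources: how the
+ * portable routines below compose into a one-block SHA-256. The state is
+ * seeded with the FIPS 180-4 initial values, the empty message pads to a
+ * single 64-byte block (0x80 followed by zeros and a zero bit length), and
+ * blst_sha256_emit() serializes the big-endian digest (e3b0c442... for "").
+ * Assumes declarations of the two routines are in scope at this point.
+ */
+static void sha256_of_empty_message_example(unsigned char md[32])
+{
+    unsigned int v[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+                          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+    unsigned char block[64] = { 0x80 };   /* remaining 63 bytes stay zero */
+
+    blst_sha256_block_data_order(v, block, 1);
+    blst_sha256_emit(md, v);
+}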
+#define ROTR(x,n) ((x)>>n | (x)<<(32-n)) +#define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) +#define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) +#define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) +#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) +#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +void blst_sha256_block_data_order(unsigned int *v, const void *inp, + size_t blocks) +{ + static const unsigned int K256[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; + const unsigned char *data = inp; + size_t round; + + a = v[0]; + b = v[1]; + c = v[2]; + d = v[3]; + e = v[4]; + f = v[5]; + g = v[6]; + h = v[7]; + + while (blocks--) { + for (round = 0; round < 16; round++) { + l = (unsigned int)data[0] << 24; + l |= (unsigned int)data[1] << 16; + l |= (unsigned int)data[2] << 8; + l |= (unsigned int)data[3]; + data += 4; + T1 = X[round] = l; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + for (; round < 64; round++) { + s0 = X[(round + 1) & 0x0f]; + s0 = sigma0(s0); + s1 = X[(round + 14) & 0x0f]; + s1 = sigma1(s1); + + T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + a += v[0]; v[0] = a; + b += v[1]; v[1] = b; + c += v[2]; v[2] = c; + d += v[3]; v[3] = d; + e += v[4]; v[4] = e; + f += v[5]; v[5] = f; + g += v[6]; v[6] = g; + h += v[7]; v[7] = h; + } +} +#undef ROTR +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj + +void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) +{ + size_t i; + + for (i=0; i<8; i++) + dst[i] = src[i]; +} + +void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + size_t i; + + for (i=0; i<8; i++, md+=4) { + unsigned int h_i = h[i]; + md[0] = (unsigned char)(h_i >> 24); + md[1] = (unsigned char)(h_i >> 16); + md[2] = (unsigned char)(h_i >> 8); + md[3] = (unsigned char)h_i; + } +} + +void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) +{ + unsigned char *dst = dst_; + const unsigned char *src = src_; + size_t i; + + for (i=0; iZ); /* Z1Z1 = Z1^2 */ + mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ + + mul_fp2(S2, Q->Y, R->Z); + mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ + + sub_fp2(H, U2, R->X); /* H = U2-X1 */ + + sqr_fp2(HH, H); /* HH = H^2 */ + add_fp2(I, HH, HH); + add_fp2(I, I, I); /* I = 4*HH */ + + mul_fp2(J, H, I); /* J = H*I */ + + sub_fp2(r, S2, R->Y); + add_fp2(r, 
r, r); /* r = 2*(S2-Y1) */ + + mul_fp2(V, R->X, I); /* V = X1*I */ + + sqr_fp2(T->X, r); + sub_fp2(T->X, T->X, J); + sub_fp2(T->X, T->X, V); + sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ + + mul_fp2(J, J, R->Y); + sub_fp2(T->Y, V, T->X); + mul_fp2(T->Y, T->Y, r); + sub_fp2(T->Y, T->Y, J); + sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ + + add_fp2(T->Z, R->Z, H); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, Z1Z1); + sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ + + /* + * line evaluation + */ + mul_fp2(I, r, Q->X); + mul_fp2(J, Q->Y, T->Z); + sub_fp2(I, I, J); + add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ +#ifdef r +# undef r +#else + vec_copy(line[1], r, sizeof(r)); +#endif + vec_copy(line[2], T->Z, sizeof(T->Z)); +} + +static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) +{ + vec384x ZZ, A, B, C, D, E, F; + + /* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr + */ + sqr_fp2(A, Q->X); /* A = X1^2 */ + sqr_fp2(B, Q->Y); /* B = Y1^2 */ + sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ + sqr_fp2(C, B); /* C = B^2 */ + + add_fp2(D, Q->X, B); /* X1+B */ + sqr_fp2(D, D); /* (X1+B)^2 */ + sub_fp2(D, D, A); /* (X1+B)^2-A */ + sub_fp2(D, D, C); /* (X1+B)^2-A-C */ + add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ + + mul_by_3_fp2(E, A); /* E = 3*A */ + sqr_fp2(F, E); /* F = E^2 */ + + add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ + + sub_fp2(T->X, F, D); + sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ + + add_fp2(T->Z, Q->Y, Q->Z); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, B); + sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ + + mul_by_8_fp2(C, C); /* 8*C */ + sub_fp2(T->Y, D, T->X); /* D-X3 */ + mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ + sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ + + /* + * line evaluation + */ + sqr_fp2(line[0], line[0]); + sub_fp2(line[0], line[0], A); + sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ + lshift_fp2(B, B, 2); + sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ + + mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ + + mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ +} + +static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) +{ + mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(line[1][1], line[1][1], Px2->X); + + mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(line[2][1], line[2][1], Px2->Y); +} + +#if 0 +static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, + const POINTonE1_affine *Px2, vec384fp6 line, size_t n) +{ + line_add(line, T, T, Q); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + line_dbl(line, T, T); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) +{ +#define Q ((const POINTonE2_affine *)Q) + POINTonE2 T[1]; + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T, T); /* 0x2 */ + line_by_Px2(line, Px2); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ + add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ + add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ + add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ + add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +#undef Q +} +#endif + +static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE1_affine Px2[], size_t n) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + + for (i = 1; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE2_affine Q[], + const POINTonE1_affine Px2[], + size_t n, size_t k) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + for (i = 0; i < n; i++) { + line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + while (k--) { + sqr_fp12(ret, ret); + for (i = 0; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + } +} + +static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], + const POINTonE1_affine P[], size_t n) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + POINTonE2 *T = alloca(n*sizeof(POINTonE2)); + POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); +#else + POINTonE2 T[n]; + POINTonE1_affine Px2[n]; +#endif + size_t i; + + if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) | + vec_is_zero(&P[0], sizeof(P[0]))) ) { + /* + * Special case of infinite aggregated signature, pair the additive + * group's identity with the multiplicative group's identity. + */ + vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12)); + return; + } + + for (i = 0; i < n; i++) { + /* Move common expression from line evaluation to line_by_Px2. 
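+         * Px2[i] caches -2*P[i].X and 2*P[i].Y so that each line_by_Px2()
+         * call costs only four fp multiplications per Miller-loop step.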
*/ + add_fp(Px2[i].X, P[i].X, P[i].X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, P[i].Y, P[i].Y); + + vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + } + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + start_dbl_n(ret, T, Px2, n); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, + const POINTonE2_affine *Q, + size_t n) +{ + line_add(lines++[0], T, T, Q); + while (n--) + line_dbl(lines++[0], T, T); +} + +static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ + POINTonE2 T[1]; + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + line_dbl(Qlines[0], T, T); /* 0x2 */ + pre_add_n_dbl(&Qlines[1], T, Q, 2); /* ..0xc */ + pre_add_n_dbl(&Qlines[4], T, Q, 3); /* ..0x68 */ + pre_add_n_dbl(&Qlines[8], T, Q, 9); /* ..0xd200 */ + pre_add_n_dbl(&Qlines[18], T, Q, 32); /* ..0xd20100000000 */ + pre_add_n_dbl(&Qlines[51], T, Q, 16); /* ..0xd201000000010000 */ +} + +static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, + const POINTonE1_affine *Px2) +{ + vec_copy(out[0], in[0], sizeof(out[0])); + + mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(out[1][1], in[1][1], Px2->X); + + mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(out[2][1], in[2][1], Px2->Y); +} + +static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], + const POINTonE1_affine *Px2, size_t n) +{ + vec384fp6 line; + + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ + post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ + post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ + post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ + post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#ifdef INTERNAL_TESTMODE +static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ + vec384fp6 lines[68]; + + precompute_lines(lines, Q); + miller_loop_lines(ret, lines, P); +} +#endif + +static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + mul_fp12(ret, ret, a); + while (n--) + cyclotomic_sqr_fp12(ret, ret); +} + +static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) +{ + cyclotomic_sqr_fp12(ret, a); /* 0x2 */ + mul_n_sqr(ret, a, 2); /* ..0xc */ + mul_n_sqr(ret, a, 3); /* ..0x68 */ + mul_n_sqr(ret, a, 9); /* ..0xd200 */ + mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ + mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) + +/* + * Adaptation from /pairing/src/bls12_381/mod.rs + */ +static void final_exp(vec384fp12 ret, const vec384fp12 f) +{ + vec384fp12 y0, y1, y2, y3; + + vec_copy(y1, f, sizeof(y1)); + conjugate_fp12(y1); + inverse_fp12(y2, f); + mul_fp12(ret, y1, y2); + frobenius_map_fp12(y2, ret, 2); + mul_fp12(ret, ret, y2); + + cyclotomic_sqr_fp12(y0, ret); + raise_to_z(y1, y0); + raise_to_z_div_by_2(y2, y1); + vec_copy(y3, ret, sizeof(y3)); + conjugate_fp12(y3); + mul_fp12(y1, y1, y3); + conjugate_fp12(y1); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y1); + raise_to_z(y3, y2); + conjugate_fp12(y1); + mul_fp12(y3, y3, y1); + conjugate_fp12(y1); + frobenius_map_fp12(y1, y1, 3); + frobenius_map_fp12(y2, y2, 2); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y3); + mul_fp12(y2, y2, y0); + mul_fp12(y2, y2, ret); + mul_fp12(y1, y1, y2); + frobenius_map_fp12(y2, y3, 1); + mul_fp12(ret, y1, y2); +} + +void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, + P ? 
P : (const POINTonE1_affine *)&BLS12_381_G1, 1); +} + +void blst_final_exp(vec384fp12 ret, const vec384fp12 f) +{ final_exp(ret, f); } + +void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ precompute_lines(Qlines, Q); } + +void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ miller_loop_lines(ret, Qlines, P); } + +static bool_t is_cyclotomic(const vec384fp12 f) +{ + vec384fp12 a, b; + + frobenius_map_fp12(a, f, 2); + frobenius_map_fp12(b, a, 2); + mul_fp12(b, b, f); + + return vec_is_equal(a, b, sizeof(a)); +} + +int blst_fp12_in_group(const vec384fp12 f) +{ + vec384fp12 a, b; + + if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f)) + return 0; + + frobenius_map_fp12(a, f, 1); + raise_to_z(b, f); + + return (int)vec_is_equal(a, b, sizeof(a)); +} diff --git a/crypto/blst_src/pentaroot-addchain.h b/crypto/blst_src/pentaroot-addchain.h new file mode 100644 index 00000000000..5bdd9ddf7f7 --- /dev/null +++ b/crypto/blst_src/pentaroot-addchain.h @@ -0,0 +1,333 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is 1/5 modulo BLS12_381_r-1. Exponentiation to which + * yields 5th root of the base. + * + * Generated with 'addchain 20974350070050476191779096203274386335076221000211055129041463479975432473805' + * https://github.com/kwantam/addchain + * # Bos-Coster (win=4) : 307 (15) + * # Bos-Coster (win=10) : 307 (18) + * # Yacobi : 319 (16) + * # Bos-Coster (win=2) : 319 ( 5) + * # Bos-Coster (win=5) : 306 (19) <<< + * # Bos-Coster (win=7) : 311 (22) + * # Bos-Coster (win=9) : 313 (20) + * # Bos-Coster (win=3) : 314 ( 9) + * # Bos-Coster (win=6) : 309 (21) + * # Bos-Coster (win=8) : 309 (23) + * # Bergeron-Berstel-Brlek-Duboc : 334 ( 5) + */ + +#define PENTAROOT_MOD_BLS12_381_r(out, inp, ptype) do { \ +ptype t[19]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[7], t[1]); /* 1: 2 */\ +sqr(t[0], t[7]); /* 2: 4 */\ +sqr(t[2], t[0]); /* 3: 8 */\ +mul(t[10], t[2], t[1]); /* 4: 9 */\ +mul(t[3], t[10], t[7]); /* 5: b */\ +mul(t[1], t[10], t[0]); /* 6: d */\ +mul(t[5], t[3], t[0]); /* 7: f */\ +mul(t[9], t[10], t[2]); /* 8: 11 */\ +mul(t[4], t[3], t[2]); /* 9: 13 */\ +mul(t[15], t[5], t[2]); /* 10: 17 */\ +mul(t[8], t[15], t[2]); /* 11: 1f */\ +mul(t[13], t[8], t[7]); /* 12: 21 */\ +mul(t[14], t[8], t[0]); /* 13: 23 */\ +mul(t[12], t[13], t[0]); /* 14: 25 */\ +mul(t[6], t[8], t[2]); /* 15: 27 */\ +mul(t[11], t[14], t[2]); /* 16: 2b */\ +sqr(t[0], t[15]); /* 17: 2e */\ +mul(t[18], t[6], t[2]); /* 18: 2f */\ +mul(t[2], t[11], t[2]); /* 19: 33 */\ +mul(t[16], t[2], t[7]); /* 20: 35 */\ +mul(t[7], t[0], t[3]); /* 21: 39 */\ +mul(t[17], t[0], t[5]); /* 22: 3d */\ +/* sqr(t[0], t[0]); */ /* 23: 5c */\ +/* sqr(t[0], t[0]); */ /* 24: b8 */\ +/* sqr(t[0], t[0]); */ /* 25: 170 */\ +/* sqr(t[0], t[0]); */ /* 26: 2e0 */\ +/* sqr(t[0], t[0]); */ /* 27: 5c0 */\ +/* sqr(t[0], t[0]); */ /* 28: b80 */\ +/* sqr(t[0], t[0]); */ /* 29: 1700 */\ +sqr_n_mul(t[0], t[0], 7, t[18]); /* 30: 172f */\ +/* sqr(t[0], t[0]); */ /* 31: 2e5e */\ +/* sqr(t[0], t[0]); */ /* 32: 5cbc */\ +/* sqr(t[0], t[0]); */ /* 33: b978 */\ +/* sqr(t[0], t[0]); */ /* 34: 172f0 */\ +/* sqr(t[0], t[0]); */ /* 35: 2e5e0 */\ +/* sqr(t[0], t[0]); */ /* 36: 5cbc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 37: 5cbe1 */\ +/* sqr(t[0], t[0]); */ /* 38: b97c2 */\ +/* sqr(t[0], t[0]); */ /* 39: 172f84 */\ +/* sqr(t[0], t[0]); */ /* 40: 
2e5f08 */\ +/* sqr(t[0], t[0]); */ /* 41: 5cbe10 */\ +/* sqr(t[0], t[0]); */ /* 42: b97c20 */\ +/* sqr(t[0], t[0]); */ /* 43: 172f840 */\ +sqr_n_mul(t[0], t[0], 6, t[17]); /* 44: 172f87d */\ +/* sqr(t[0], t[0]); */ /* 45: 2e5f0fa */\ +/* sqr(t[0], t[0]); */ /* 46: 5cbe1f4 */\ +/* sqr(t[0], t[0]); */ /* 47: b97c3e8 */\ +/* sqr(t[0], t[0]); */ /* 48: 172f87d0 */\ +/* sqr(t[0], t[0]); */ /* 49: 2e5f0fa0 */\ +/* sqr(t[0], t[0]); */ /* 50: 5cbe1f40 */\ +sqr_n_mul(t[0], t[0], 6, t[16]); /* 51: 5cbe1f75 */\ +/* sqr(t[0], t[0]); */ /* 52: b97c3eea */\ +/* sqr(t[0], t[0]); */ /* 53: 172f87dd4 */\ +/* sqr(t[0], t[0]); */ /* 54: 2e5f0fba8 */\ +/* sqr(t[0], t[0]); */ /* 55: 5cbe1f750 */\ +/* sqr(t[0], t[0]); */ /* 56: b97c3eea0 */\ +sqr_n_mul(t[0], t[0], 5, t[15]); /* 57: b97c3eeb7 */\ +/* sqr(t[0], t[0]); */ /* 58: 172f87dd6e */\ +/* sqr(t[0], t[0]); */ /* 59: 2e5f0fbadc */\ +/* sqr(t[0], t[0]); */ /* 60: 5cbe1f75b8 */\ +/* sqr(t[0], t[0]); */ /* 61: b97c3eeb70 */\ +/* sqr(t[0], t[0]); */ /* 62: 172f87dd6e0 */\ +/* sqr(t[0], t[0]); */ /* 63: 2e5f0fbadc0 */\ +sqr_n_mul(t[0], t[0], 6, t[15]); /* 64: 2e5f0fbadd7 */\ +/* sqr(t[0], t[0]); */ /* 65: 5cbe1f75bae */\ +/* sqr(t[0], t[0]); */ /* 66: b97c3eeb75c */\ +/* sqr(t[0], t[0]); */ /* 67: 172f87dd6eb8 */\ +/* sqr(t[0], t[0]); */ /* 68: 2e5f0fbadd70 */\ +/* sqr(t[0], t[0]); */ /* 69: 5cbe1f75bae0 */\ +/* sqr(t[0], t[0]); */ /* 70: b97c3eeb75c0 */\ +/* sqr(t[0], t[0]); */ /* 71: 172f87dd6eb80 */\ +/* sqr(t[0], t[0]); */ /* 72: 2e5f0fbadd700 */\ +sqr_n_mul(t[0], t[0], 8, t[14]); /* 73: 2e5f0fbadd723 */\ +/* sqr(t[0], t[0]); */ /* 74: 5cbe1f75bae46 */\ +/* sqr(t[0], t[0]); */ /* 75: b97c3eeb75c8c */\ +/* sqr(t[0], t[0]); */ /* 76: 172f87dd6eb918 */\ +/* sqr(t[0], t[0]); */ /* 77: 2e5f0fbadd7230 */\ +/* sqr(t[0], t[0]); */ /* 78: 5cbe1f75bae460 */\ +/* sqr(t[0], t[0]); */ /* 79: b97c3eeb75c8c0 */\ +/* sqr(t[0], t[0]); */ /* 80: 172f87dd6eb9180 */\ +/* sqr(t[0], t[0]); */ /* 81: 2e5f0fbadd72300 */\ +sqr_n_mul(t[0], t[0], 8, t[13]); /* 82: 2e5f0fbadd72321 */\ +/* sqr(t[0], t[0]); */ /* 83: 5cbe1f75bae4642 */\ +/* sqr(t[0], t[0]); */ /* 84: b97c3eeb75c8c84 */\ +/* sqr(t[0], t[0]); */ /* 85: 172f87dd6eb91908 */\ +/* sqr(t[0], t[0]); */ /* 86: 2e5f0fbadd723210 */\ +/* sqr(t[0], t[0]); */ /* 87: 5cbe1f75bae46420 */\ +/* sqr(t[0], t[0]); */ /* 88: b97c3eeb75c8c840 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 89: b97c3eeb75c8c873 */\ +/* sqr(t[0], t[0]); */ /* 90: 172f87dd6eb9190e6 */\ +/* sqr(t[0], t[0]); */ /* 91: 2e5f0fbadd72321cc */\ +/* sqr(t[0], t[0]); */ /* 92: 5cbe1f75bae464398 */\ +/* sqr(t[0], t[0]); */ /* 93: b97c3eeb75c8c8730 */\ +/* sqr(t[0], t[0]); */ /* 94: 172f87dd6eb9190e60 */\ +/* sqr(t[0], t[0]); */ /* 95: 2e5f0fbadd72321cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 96: 2e5f0fbadd72321ce1 */\ +/* sqr(t[0], t[0]); */ /* 97: 5cbe1f75bae46439c2 */\ +/* sqr(t[0], t[0]); */ /* 98: b97c3eeb75c8c87384 */\ +/* sqr(t[0], t[0]); */ /* 99: 172f87dd6eb9190e708 */\ +/* sqr(t[0], t[0]); */ /* 100: 2e5f0fbadd72321ce10 */\ +/* sqr(t[0], t[0]); */ /* 101: 5cbe1f75bae46439c20 */\ +/* sqr(t[0], t[0]); */ /* 102: b97c3eeb75c8c873840 */\ +/* sqr(t[0], t[0]); */ /* 103: 172f87dd6eb9190e7080 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 104: 172f87dd6eb9190e70a5 */\ +/* sqr(t[0], t[0]); */ /* 105: 2e5f0fbadd72321ce14a */\ +/* sqr(t[0], t[0]); */ /* 106: 5cbe1f75bae46439c294 */\ +/* sqr(t[0], t[0]); */ /* 107: b97c3eeb75c8c8738528 */\ +/* sqr(t[0], t[0]); */ /* 108: 172f87dd6eb9190e70a50 */\ +/* sqr(t[0], t[0]); */ /* 109: 2e5f0fbadd72321ce14a0 */\ +/* sqr(t[0], t[0]); */ /* 110: 
5cbe1f75bae46439c2940 */\ +/* sqr(t[0], t[0]); */ /* 111: b97c3eeb75c8c87385280 */\ +/* sqr(t[0], t[0]); */ /* 112: 172f87dd6eb9190e70a500 */\ +sqr_n_mul(t[0], t[0], 8, t[11]); /* 113: 172f87dd6eb9190e70a52b */\ +/* sqr(t[0], t[0]); */ /* 114: 2e5f0fbadd72321ce14a56 */\ +/* sqr(t[0], t[0]); */ /* 115: 5cbe1f75bae46439c294ac */\ +/* sqr(t[0], t[0]); */ /* 116: b97c3eeb75c8c873852958 */\ +/* sqr(t[0], t[0]); */ /* 117: 172f87dd6eb9190e70a52b0 */\ +/* sqr(t[0], t[0]); */ /* 118: 2e5f0fbadd72321ce14a560 */\ +/* sqr(t[0], t[0]); */ /* 119: 5cbe1f75bae46439c294ac0 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 120: 5cbe1f75bae46439c294acd */\ +/* sqr(t[0], t[0]); */ /* 121: b97c3eeb75c8c873852959a */\ +/* sqr(t[0], t[0]); */ /* 122: 172f87dd6eb9190e70a52b34 */\ +/* sqr(t[0], t[0]); */ /* 123: 2e5f0fbadd72321ce14a5668 */\ +/* sqr(t[0], t[0]); */ /* 124: 5cbe1f75bae46439c294acd0 */\ +/* sqr(t[0], t[0]); */ /* 125: b97c3eeb75c8c873852959a0 */\ +/* sqr(t[0], t[0]); */ /* 126: 172f87dd6eb9190e70a52b340 */\ +/* sqr(t[0], t[0]); */ /* 127: 2e5f0fbadd72321ce14a56680 */\ +/* sqr(t[0], t[0]); */ /* 128: 5cbe1f75bae46439c294acd00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 129: 5cbe1f75bae46439c294acd33 */\ +/* sqr(t[0], t[0]); */ /* 130: b97c3eeb75c8c873852959a66 */\ +/* sqr(t[0], t[0]); */ /* 131: 172f87dd6eb9190e70a52b34cc */\ +/* sqr(t[0], t[0]); */ /* 132: 2e5f0fbadd72321ce14a566998 */\ +/* sqr(t[0], t[0]); */ /* 133: 5cbe1f75bae46439c294acd330 */\ +/* sqr(t[0], t[0]); */ /* 134: b97c3eeb75c8c873852959a660 */\ +/* sqr(t[0], t[0]); */ /* 135: 172f87dd6eb9190e70a52b34cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 136: 172f87dd6eb9190e70a52b34ceb */\ +/* sqr(t[0], t[0]); */ /* 137: 2e5f0fbadd72321ce14a56699d6 */\ +/* sqr(t[0], t[0]); */ /* 138: 5cbe1f75bae46439c294acd33ac */\ +/* sqr(t[0], t[0]); */ /* 139: b97c3eeb75c8c873852959a6758 */\ +/* sqr(t[0], t[0]); */ /* 140: 172f87dd6eb9190e70a52b34ceb0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 141: 172f87dd6eb9190e70a52b34ceb9 */\ +/* sqr(t[0], t[0]); */ /* 142: 2e5f0fbadd72321ce14a56699d72 */\ +/* sqr(t[0], t[0]); */ /* 143: 5cbe1f75bae46439c294acd33ae4 */\ +/* sqr(t[0], t[0]); */ /* 144: b97c3eeb75c8c873852959a675c8 */\ +/* sqr(t[0], t[0]); */ /* 145: 172f87dd6eb9190e70a52b34ceb90 */\ +/* sqr(t[0], t[0]); */ /* 146: 2e5f0fbadd72321ce14a56699d720 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 147: 2e5f0fbadd72321ce14a56699d73f */\ +/* sqr(t[0], t[0]); */ /* 148: 5cbe1f75bae46439c294acd33ae7e */\ +/* sqr(t[0], t[0]); */ /* 149: b97c3eeb75c8c873852959a675cfc */\ +/* sqr(t[0], t[0]); */ /* 150: 172f87dd6eb9190e70a52b34ceb9f8 */\ +/* sqr(t[0], t[0]); */ /* 151: 2e5f0fbadd72321ce14a56699d73f0 */\ +/* sqr(t[0], t[0]); */ /* 152: 5cbe1f75bae46439c294acd33ae7e0 */\ +/* sqr(t[0], t[0]); */ /* 153: b97c3eeb75c8c873852959a675cfc0 */\ +/* sqr(t[0], t[0]); */ /* 154: 172f87dd6eb9190e70a52b34ceb9f80 */\ +/* sqr(t[0], t[0]); */ /* 155: 2e5f0fbadd72321ce14a56699d73f00 */\ +/* sqr(t[0], t[0]); */ /* 156: 5cbe1f75bae46439c294acd33ae7e00 */\ +/* sqr(t[0], t[0]); */ /* 157: b97c3eeb75c8c873852959a675cfc00 */\ +/* sqr(t[0], t[0]); */ /* 158: 172f87dd6eb9190e70a52b34ceb9f800 */\ +/* sqr(t[0], t[0]); */ /* 159: 2e5f0fbadd72321ce14a56699d73f000 */\ +/* sqr(t[0], t[0]); */ /* 160: 5cbe1f75bae46439c294acd33ae7e000 */\ +/* sqr(t[0], t[0]); */ /* 161: b97c3eeb75c8c873852959a675cfc000 */\ +/* sqr(t[0], t[0]); */ /* 162: 172f87dd6eb9190e70a52b34ceb9f8000 */\ +sqr_n_mul(t[0], t[0], 15, t[9]); /* 163: 172f87dd6eb9190e70a52b34ceb9f8011 */\ +/* sqr(t[0], t[0]); */ /* 164: 2e5f0fbadd72321ce14a56699d73f0022 */\ 
+/* sqr(t[0], t[0]); */ /* 165: 5cbe1f75bae46439c294acd33ae7e0044 */\ +/* sqr(t[0], t[0]); */ /* 166: b97c3eeb75c8c873852959a675cfc0088 */\ +/* sqr(t[0], t[0]); */ /* 167: 172f87dd6eb9190e70a52b34ceb9f80110 */\ +/* sqr(t[0], t[0]); */ /* 168: 2e5f0fbadd72321ce14a56699d73f00220 */\ +/* sqr(t[0], t[0]); */ /* 169: 5cbe1f75bae46439c294acd33ae7e00440 */\ +/* sqr(t[0], t[0]); */ /* 170: b97c3eeb75c8c873852959a675cfc00880 */\ +/* sqr(t[0], t[0]); */ /* 171: 172f87dd6eb9190e70a52b34ceb9f801100 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 172: 172f87dd6eb9190e70a52b34ceb9f80110b */\ +/* sqr(t[0], t[0]); */ /* 173: 2e5f0fbadd72321ce14a56699d73f002216 */\ +/* sqr(t[0], t[0]); */ /* 174: 5cbe1f75bae46439c294acd33ae7e00442c */\ +/* sqr(t[0], t[0]); */ /* 175: b97c3eeb75c8c873852959a675cfc008858 */\ +/* sqr(t[0], t[0]); */ /* 176: 172f87dd6eb9190e70a52b34ceb9f80110b0 */\ +/* sqr(t[0], t[0]); */ /* 177: 2e5f0fbadd72321ce14a56699d73f0022160 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 178: 2e5f0fbadd72321ce14a56699d73f002217f */\ +/* sqr(t[0], t[0]); */ /* 179: 5cbe1f75bae46439c294acd33ae7e00442fe */\ +/* sqr(t[0], t[0]); */ /* 180: b97c3eeb75c8c873852959a675cfc00885fc */\ +/* sqr(t[0], t[0]); */ /* 181: 172f87dd6eb9190e70a52b34ceb9f80110bf8 */\ +/* sqr(t[0], t[0]); */ /* 182: 2e5f0fbadd72321ce14a56699d73f002217f0 */\ +/* sqr(t[0], t[0]); */ /* 183: 5cbe1f75bae46439c294acd33ae7e00442fe0 */\ +/* sqr(t[0], t[0]); */ /* 184: b97c3eeb75c8c873852959a675cfc00885fc0 */\ +/* sqr(t[0], t[0]); */ /* 185: 172f87dd6eb9190e70a52b34ceb9f80110bf80 */\ +/* sqr(t[0], t[0]); */ /* 186: 2e5f0fbadd72321ce14a56699d73f002217f00 */\ +/* sqr(t[0], t[0]); */ /* 187: 5cbe1f75bae46439c294acd33ae7e00442fe00 */\ +/* sqr(t[0], t[0]); */ /* 188: b97c3eeb75c8c873852959a675cfc00885fc00 */\ +sqr_n_mul(t[0], t[0], 10, t[7]); /* 189: b97c3eeb75c8c873852959a675cfc00885fc39 */\ +/* sqr(t[0], t[0]); */ /* 190: 172f87dd6eb9190e70a52b34ceb9f80110bf872 */\ +/* sqr(t[0], t[0]); */ /* 191: 2e5f0fbadd72321ce14a56699d73f002217f0e4 */\ +/* sqr(t[0], t[0]); */ /* 192: 5cbe1f75bae46439c294acd33ae7e00442fe1c8 */\ +/* sqr(t[0], t[0]); */ /* 193: b97c3eeb75c8c873852959a675cfc00885fc390 */\ +/* sqr(t[0], t[0]); */ /* 194: 172f87dd6eb9190e70a52b34ceb9f80110bf8720 */\ +/* sqr(t[0], t[0]); */ /* 195: 2e5f0fbadd72321ce14a56699d73f002217f0e40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 196: 2e5f0fbadd72321ce14a56699d73f002217f0e67 */\ +/* sqr(t[0], t[0]); */ /* 197: 5cbe1f75bae46439c294acd33ae7e00442fe1cce */\ +/* sqr(t[0], t[0]); */ /* 198: b97c3eeb75c8c873852959a675cfc00885fc399c */\ +/* sqr(t[0], t[0]); */ /* 199: 172f87dd6eb9190e70a52b34ceb9f80110bf87338 */\ +/* sqr(t[0], t[0]); */ /* 200: 2e5f0fbadd72321ce14a56699d73f002217f0e670 */\ +/* sqr(t[0], t[0]); */ /* 201: 5cbe1f75bae46439c294acd33ae7e00442fe1cce0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 202: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3 */\ +/* sqr(t[0], t[0]); */ /* 203: b97c3eeb75c8c873852959a675cfc00885fc399e6 */\ +/* sqr(t[0], t[0]); */ /* 204: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc */\ +/* sqr(t[0], t[0]); */ /* 205: 2e5f0fbadd72321ce14a56699d73f002217f0e6798 */\ +/* sqr(t[0], t[0]); */ /* 206: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf30 */\ +/* sqr(t[0], t[0]); */ /* 207: b97c3eeb75c8c873852959a675cfc00885fc399e60 */\ +/* sqr(t[0], t[0]); */ /* 208: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc0 */\ +/* sqr(t[0], t[0]); */ /* 209: 2e5f0fbadd72321ce14a56699d73f002217f0e67980 */\ +/* sqr(t[0], t[0]); */ /* 210: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 211: 
5cbe1f75bae46439c294acd33ae7e00442fe1ccf333 */\ +/* sqr(t[0], t[0]); */ /* 212: b97c3eeb75c8c873852959a675cfc00885fc399e666 */\ +/* sqr(t[0], t[0]); */ /* 213: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc */\ +/* sqr(t[0], t[0]); */ /* 214: 2e5f0fbadd72321ce14a56699d73f002217f0e679998 */\ +/* sqr(t[0], t[0]); */ /* 215: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3330 */\ +/* sqr(t[0], t[0]); */ /* 216: b97c3eeb75c8c873852959a675cfc00885fc399e6660 */\ +/* sqr(t[0], t[0]); */ /* 217: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc0 */\ +/* sqr(t[0], t[0]); */ /* 218: 2e5f0fbadd72321ce14a56699d73f002217f0e6799980 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 219: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f */\ +/* sqr(t[0], t[0]); */ /* 220: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e */\ +/* sqr(t[0], t[0]); */ /* 221: b97c3eeb75c8c873852959a675cfc00885fc399e6663c */\ +/* sqr(t[0], t[0]); */ /* 222: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78 */\ +/* sqr(t[0], t[0]); */ /* 223: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f0 */\ +/* sqr(t[0], t[0]); */ /* 224: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e0 */\ +/* sqr(t[0], t[0]); */ /* 225: b97c3eeb75c8c873852959a675cfc00885fc399e6663c0 */\ +/* sqr(t[0], t[0]); */ /* 226: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc780 */\ +/* sqr(t[0], t[0]); */ /* 227: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f00 */\ +/* sqr(t[0], t[0]); */ /* 228: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e00 */\ +sqr_n_mul(t[0], t[0], 9, t[2]); /* 229: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33 */\ +/* sqr(t[0], t[0]); */ /* 230: b97c3eeb75c8c873852959a675cfc00885fc399e6663c66 */\ +/* sqr(t[0], t[0]); */ /* 231: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc */\ +/* sqr(t[0], t[0]); */ /* 232: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f198 */\ +/* sqr(t[0], t[0]); */ /* 233: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e330 */\ +/* sqr(t[0], t[0]); */ /* 234: b97c3eeb75c8c873852959a675cfc00885fc399e6663c660 */\ +/* sqr(t[0], t[0]); */ /* 235: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc0 */\ +/* sqr(t[0], t[0]); */ /* 236: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1980 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 237: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993 */\ +/* sqr(t[0], t[0]); */ /* 238: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326 */\ +/* sqr(t[0], t[0]); */ /* 239: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c */\ +/* sqr(t[0], t[0]); */ /* 240: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc98 */\ +/* sqr(t[0], t[0]); */ /* 241: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19930 */\ +/* sqr(t[0], t[0]); */ /* 242: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33260 */\ +/* sqr(t[0], t[0]); */ /* 243: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c0 */\ +/* sqr(t[0], t[0]); */ /* 244: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc980 */\ +/* sqr(t[0], t[0]); */ /* 245: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 246: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333 */\ +/* sqr(t[0], t[0]); */ /* 247: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666 */\ +/* sqr(t[0], t[0]); */ /* 248: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc */\ +/* sqr(t[0], t[0]); */ /* 249: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9998 */\ +/* sqr(t[0], t[0]); */ /* 250: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993330 */\ +/* sqr(t[0], t[0]); */ /* 251: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326660 */\ +/* sqr(t[0], t[0]); */ /* 252: 
b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc0 */\ +/* sqr(t[0], t[0]); */ /* 253: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99980 */\ +/* sqr(t[0], t[0]); */ /* 254: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 255: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333 */\ +/* sqr(t[0], t[0]); */ /* 256: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666 */\ +/* sqr(t[0], t[0]); */ /* 257: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc */\ +/* sqr(t[0], t[0]); */ /* 258: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999998 */\ +/* sqr(t[0], t[0]); */ /* 259: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333330 */\ +/* sqr(t[0], t[0]); */ /* 260: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666660 */\ +/* sqr(t[0], t[0]); */ /* 261: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 262: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999980 */\ +/* sqr(t[0], t[0]); */ /* 263: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 264: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333333 */\ +/* sqr(t[0], t[0]); */ /* 265: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666 */\ +/* sqr(t[0], t[0]); */ /* 266: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc */\ +/* sqr(t[0], t[0]); */ /* 267: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999998 */\ +/* sqr(t[0], t[0]); */ /* 268: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333330 */\ +/* sqr(t[0], t[0]); */ /* 269: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666660 */\ +/* sqr(t[0], t[0]); */ /* 270: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 271: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb */\ +/* sqr(t[0], t[0]); */ /* 272: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996 */\ +/* sqr(t[0], t[0]); */ /* 273: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c */\ +/* sqr(t[0], t[0]); */ /* 274: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666658 */\ +/* sqr(t[0], t[0]); */ /* 275: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb0 */\ +/* sqr(t[0], t[0]); */ /* 276: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999960 */\ +/* sqr(t[0], t[0]); */ /* 277: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c0 */\ +/* sqr(t[0], t[0]); */ /* 278: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666580 */\ +/* sqr(t[0], t[0]); */ /* 279: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 280: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33 */\ +/* sqr(t[0], t[0]); */ /* 281: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666 */\ +/* sqr(t[0], t[0]); */ /* 282: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc */\ +/* sqr(t[0], t[0]); */ /* 283: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665998 */\ +/* sqr(t[0], t[0]); */ /* 284: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb330 */\ +/* sqr(t[0], t[0]); */ /* 285: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996660 */\ +/* sqr(t[0], t[0]); */ /* 286: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc0 */\ +/* sqr(t[0], t[0]); */ /* 287: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659980 */\ +/* sqr(t[0], t[0]); */ /* 288: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 289: 
b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333 */\ +/* sqr(t[0], t[0]); */ /* 290: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666 */\ +/* sqr(t[0], t[0]); */ /* 291: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc */\ +/* sqr(t[0], t[0]); */ /* 292: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666599998 */\ +/* sqr(t[0], t[0]); */ /* 293: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33330 */\ +/* sqr(t[0], t[0]); */ /* 294: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666660 */\ +/* sqr(t[0], t[0]); */ /* 295: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 296: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665999980 */\ +/* sqr(t[0], t[0]); */ /* 297: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 298: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333333 */\ +/* sqr(t[0], t[0]); */ /* 299: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996666666 */\ +/* sqr(t[0], t[0]); */ /* 300: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc */\ +/* sqr(t[0], t[0]); */ /* 301: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659999998 */\ +/* sqr(t[0], t[0]); */ /* 302: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333330 */\ +/* sqr(t[0], t[0]); */ /* 303: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666660 */\ +/* sqr(t[0], t[0]); */ /* 304: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc0 */\ +sqr_n_mul(out, t[0], 6, t[1]); /* 305: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332cccccccd */\ +} while(0) diff --git a/crypto/blst_src/pentaroot.c b/crypto/blst_src/pentaroot.c new file mode 100644 index 00000000000..fd028113f3d --- /dev/null +++ b/crypto/blst_src/pentaroot.c @@ -0,0 +1,76 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +static inline void mul_fr(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +static inline void sqr_fr(vec384 ret, const vec384 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +#ifdef __OPTIMIZE_SIZE__ +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ + static const byte pow[] = { + TO_BYTES(0x33333332cccccccd), TO_BYTES(0x217f0e679998f199), + TO_BYTES(0xe14a56699d73f002), TO_BYTES(0x2e5f0fbadd72321c) + }; + size_t pow_bits = 254; + vec256 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_fr(ret, ret); + if (is_bit_set(pow, pow_bits)) + mul_fr(ret, ret, inp); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +} +#else +# if 0 +/* + * "255"-bit variant omits full reductions at the ends of squarings, + * not implemented yet[?]. 
+ */ +static inline void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ sqr_n_mul_mont_255(out, a, count, BLS12_381_r, r0, b); } +# else +static void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ + do { + sqr_fr(out, a); + a = out; + } while (--count); + mul_fr(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fr(ret,a) +# define mul(ret,a,b) mul_fr(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fr(ret,a,n,b) + +# include "pentaroot-addchain.h" +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ PENTAROOT_MOD_BLS12_381_r(out, inp, vec256); } +# undef PENTAROOT_MOD_BLS12_381_r + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +void blst_fr_pentapow(vec256 out, const vec256 inp) +{ + vec256 tmp; + + sqr_fr(tmp, inp); + sqr_fr(tmp, tmp); + mul_fr(out, tmp, inp); +} diff --git a/crypto/blst_src/point.h b/crypto/blst_src/point.h new file mode 100644 index 00000000000..0aa7379671f --- /dev/null +++ b/crypto/blst_src/point.h @@ -0,0 +1,62 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_POINT_H__ +#define __BLS12_381_ASM_POINT_H__ + +#include "vect.h" +#include "bytes.h" + +#define DECLARE_POINT(ptype, bits) \ +typedef struct { vec##bits X,Y,Z; } ptype; \ +typedef struct { vec##bits X,Y; } ptype##_affine; \ +\ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4); \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_double(ptype *out, const ptype *p1); \ +static void ptype##_mult_w5(ptype *out, const ptype *point, \ + const byte *scalar, size_t nbits); \ +static void ptype##_cneg(ptype *p, limb_t cbit); \ +static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \ +static void ptype##_from_Jacobian(ptype *out, const ptype *in); \ +\ +static inline void ptype##_cswap(ptype *restrict a, \ + ptype *restrict b, bool_t cbit) { \ + vec_cswap(a, b, sizeof(ptype), cbit); \ +} \ +static inline void ptype##_ccopy(ptype *restrict a, \ + const ptype *restrict b, bool_t cbit) {\ + vec_select(a, b, a, sizeof(ptype), cbit); \ +} + +#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \ +typedef struct { vec##bits X,Z; } ptype##xz; \ +\ +static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p); \ +static void ptype##xz_ladder_post(ptype *ret, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1);\ +\ +static inline void ptype##xz_cswap(ptype##xz *restrict a, \ + ptype##xz *restrict b, bool_t cbit) {\ + vec_cswap(a, b, sizeof(ptype##xz), cbit); \ +} + +DECLARE_POINT(POINTonE1, 384) + +DECLARE_POINT(POINTonE2, 384x) + +#ifdef __GNUC__ +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#endif diff --git a/crypto/blst_src/rb_tree.c b/crypto/blst_src/rb_tree.c new file mode 100644 index 00000000000..207becdad18 --- /dev/null +++ b/crypto/blst_src/rb_tree.c @@ -0,0 +1,145 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include <stddef.h> + +/* + * Red-black tree tailored for uniqueness test. Amount of messages to be + * checked is known prior context initialization, implementation is + * insert-only, failure is returned if message is already in the tree. + */ + +struct node { + struct node *leafs[2]; + const void *data; + size_t len_n_colour; /* len<<1 | colour */ +}; + +struct rb_tree { + struct node *root; + size_t n_nodes; + struct node nodes[1]; +}; + +static long bytes_compare(const unsigned char *ptr0, size_t len0, + const unsigned char *ptr1, size_t len1) +{ + size_t i, len = len0<len1 ? len0 : len1; + long a, b; + + for (i = 0; i < len; i++) { + if ((a = ptr0[i]) != (b = ptr1[i])) + return a - b; + } + + return (long)len0 - (long)len1; +} + +#define PAINT_BLACK(p) ((p)->len_n_colour &= ~(size_t)1) +#define PAINT_RED(p) ((p)->len_n_colour |= 1) +#define IS_RED(p) ((p)->len_n_colour & 1) + +static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len) +{ + struct node *nodes[8*sizeof(void *)]; /* visited nodes */ + unsigned char dirs[8*sizeof(void *)]; /* taken directions */ + size_t k = 0; /* walked distance */ + struct node *p, *y, *z; + + for (p = tree->root; p != NULL; k++) { + long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1); + + if (cmp == 0) + return 0; /* already in tree, no insertion */ + + /* record the step */ + nodes[k] = p; + p = p->leafs[(dirs[k] = cmp>0)]; + } + + /* allocate new node */ + z = &tree->nodes[tree->n_nodes++]; + z->leafs[0] = z->leafs[1] = NULL; + z->data = data; + z->len_n_colour = len<<1; + PAINT_RED(z); + + /* graft |z| */ + if (k > 0) + nodes[k-1]->leafs[dirs[k-1]] = z; + else + tree->root = z; + + /* re-balance |tree| */ + while (k >= 2 && IS_RED(y = nodes[k-1])) { + size_t ydir = dirs[k-2]; + struct node *x = nodes[k-2], /* |z|'s grandparent */ + *s = x->leafs[ydir^1]; /* |z|'s uncle */ + + if (s != NULL && IS_RED(s)) { + PAINT_RED(x); + PAINT_BLACK(y); + PAINT_BLACK(s); + k -= 2; + } else { + if (dirs[k-1] != ydir) { + /* | | + * x x + * / \ \ + * y s -> z s + * \ / + * z y + * / \ + * ? ? + */ + struct node *t = y; + y = y->leafs[ydir^1]; + t->leafs[ydir^1] = y->leafs[ydir]; + y->leafs[ydir] = t; + } + + /* | | + * x y + * \ / \ + * y s -> z x + * / \ / \ + * z ? ? s + */ + x->leafs[ydir] = y->leafs[ydir^1]; + y->leafs[ydir^1] = x; + + PAINT_RED(x); + PAINT_BLACK(y); + + if (k > 2) + nodes[k-3]->leafs[dirs[k-3]] = y; + else + tree->root = y; + + break; + } + } + + PAINT_BLACK(tree->root); + + return 1; +} + +#undef IS_RED +#undef PAINT_RED +#undef PAINT_BLACK + +size_t blst_uniq_sizeof(size_t n_nodes) +{ return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1); } + +void blst_uniq_init(struct rb_tree *tree) +{ + tree->root = NULL; + tree->n_nodes = 0; +} + +int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len) +{ return (int)rb_tree_insert(tree, data, len); } diff --git a/crypto/blst_src/recip-addchain.h b/crypto/blst_src/recip-addchain.h new file mode 100644 index 00000000000..e4e436a3f09 --- /dev/null +++ b/crypto/blst_src/recip-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is BLS12_381_P-2. Exponentiation to which yields + * reciprocal to input base.
+ * + * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 461 (16) <<< + * # Bos-Coster (win=3) : 464 ( 9) + * # Bos-Coster (win=8) : 469 (35) + * # Bos-Coster (win=5) : 463 (28) + * # Bos-Coster (win=9) : 467 (32) + * # Bos-Coster (win=7) : 462 (27) + * # Yacobi : 481 (31) + * # Bos-Coster (win=10) : 475 (30) + * # Bos-Coster (win=6) : 463 (32) + * # Bos-Coster (win=2) : 489 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) + */ + +#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[1]); /* 1: 2 */\ +mul(t[9], t[0], t[1]); /* 2: 3 */\ +sqr(t[5], t[0]); /* 3: 4 */\ +mul(t[2], t[9], t[0]); /* 4: 5 */\ +mul(t[7], t[5], t[9]); /* 5: 7 */\ +mul(t[10], t[2], t[5]); /* 6: 9 */\ +mul(t[13], t[7], t[5]); /* 7: b */\ +mul(t[4], t[10], t[5]); /* 8: d */\ +mul(t[8], t[13], t[5]); /* 9: f */\ +mul(t[15], t[4], t[5]); /* 10: 11 */\ +mul(t[11], t[8], t[5]); /* 11: 13 */\ +mul(t[3], t[15], t[5]); /* 12: 15 */\ +mul(t[12], t[11], t[5]); /* 13: 17 */\ +sqr(t[0], t[4]); /* 14: 1a */\ +mul(t[14], t[12], t[5]); /* 15: 1b */\ +mul(t[6], t[0], t[9]); /* 16: 1d */\ +mul(t[5], t[0], t[2]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ +/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ +sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ +} while(0) diff --git a/crypto/blst_src/recip.c b/crypto/blst_src/recip.c new file mode 100644 index 00000000000..e0c700635ed --- /dev/null +++ b/crypto/blst_src/recip.c @@ -0,0 +1,139 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +/* + * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% + * more than corresponding optimal addition-chain, plus mispredicted + * branch penalties on top of that... The addition chain below was + * measured to be >50% faster. 
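[Editorial aside, not part of the upstream blst sources] The size-optimized branch described above is plain Fermat inversion: for prime p and nonzero x, x^(p-2) = x^-1 (mod p), evaluated with generic square-and-multiply, which is exactly what the fixed addition chain replaces with a cheaper hand-scheduled sequence of squarings and multiplications. A minimal stand-alone sketch of that generic approach over a toy 64-bit prime, assuming a compiler that provides the __uint128_t extension:

#include <stdint.h>
#include <stdio.h>

/* (a * b) mod p without overflow, via the 128-bit compiler extension */
static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t p)
{   return (uint64_t)(((__uint128_t)a * b) % p);   }

/* right-to-left square-and-multiply: x^e mod p */
static uint64_t powmod(uint64_t x, uint64_t e, uint64_t p)
{
    uint64_t acc = 1;
    for (; e != 0; e >>= 1, x = mulmod(x, x, p))
        if (e & 1)
            acc = mulmod(acc, x, p);
    return acc;
}

int main(void)
{
    const uint64_t p = 0xffffffff00000001ULL;   /* a 64-bit prime, 2^64 - 2^32 + 1 */
    uint64_t x = 12345, inv = powmod(x, p - 2, p);
    printf("x * x^-1 mod p = %llu\n", (unsigned long long)mulmod(x, inv, p)); /* 1 */
    return 0;
}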
+ */ +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + static const byte BLS12_381_P_minus_2[] = { + TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff), + TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf), + TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a) + }; + + exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); +} +#else +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "recip-addchain.h" +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIPROCAL_MOD_BLS12_381_P +# undef sqr_n_mul +# undef mul +# undef sqr +#endif + +static void flt_reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + flt_reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + static const vec384 Px8 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd), + TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb), + TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2) + }; +#ifdef __BLST_NO_ASM__ +# define RRx4 BLS12_381_RR +#else + static const vec384 RRx4 = { /* (4<<768)%P */ + TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8), + TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983), + TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175) + }; +#endif + union { vec768 x; vec384 r[2]; } temp; + + ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8); + redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0); + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* sign goes straight to flt_reciprocal */ + mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0); + if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) | + vec_is_zero(temp.r[1], sizeof(vec384))) + vec_copy(out, temp.r[0], sizeof(vec384)); + else + flt_reciprocal_fp(out, inp); +#else + vec_copy(out, temp.r[0], sizeof(vec384)); +#endif +#undef RRx4 +} + +void blst_fp_inverse(vec384 out, const vec384 inp) +{ reciprocal_fp(out, inp); } + +void blst_fp_eucl_inverse(vec384 ret, const vec384 a) +{ reciprocal_fp(ret, a); } + +static void reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +void blst_fp2_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +void blst_fp2_eucl_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +static void reciprocal_fr(vec256 out, const vec256 inp) +{ + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + + ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2); + redc_mont_256(out, temp, BLS12_381_r, r0); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fr_inverse(vec256 out, const vec256 inp) +{ 
reciprocal_fr(out, inp); } + +void blst_fr_eucl_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } diff --git a/crypto/blst_src/server.c b/crypto/blst_src/server.c new file mode 100644 index 00000000000..c124bcec078 --- /dev/null +++ b/crypto/blst_src/server.c @@ -0,0 +1,27 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "map_to_g1.c" +#include "e2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "aggregate.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "bulk_addition.c" +#include "multi_scalar.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" +#include "rb_tree.c" +#ifdef BLST_FR_PENTAROOT +# include "pentaroot.c" +#endif diff --git a/crypto/blst_src/sha256.h b/crypto/blst_src/sha256.h new file mode 100644 index 00000000000..77ddb6dc848 --- /dev/null +++ b/crypto/blst_src/sha256.h @@ -0,0 +1,140 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_SHA256_H__ +#define __BLS12_381_ASM_SHA256_H__ + +#include "vect.h" + +#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \ + defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_data_order_shaext +#elif defined(__aarch64__) && \ + defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_armv8 +#else +# define sha256_block_data_order blst_sha256_block_data_order +#endif +#define sha256_hcopy blst_sha256_hcopy +#define sha256_bcopy blst_sha256_bcopy +#define sha256_emit blst_sha256_emit + +void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); +void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); +void sha256_bcopy(void *dst, const void *src, size_t len); + +/* + * If SHA256_CTX conflicts with something, just redefine it to alternative + * custom name prior including this header. 
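[Editorial aside, not part of the upstream diff] A minimal usage sketch of the streaming SHA-256 context defined just below, assuming the including translation unit is linked against the blst object that provides sha256_block_data_order and sha256_emit:

/* hash two buffers as one message: init, update as data arrives, finalize */
static void digest_two_parts(unsigned char md[32],
                             const void *part1, size_t len1,
                             const void *part2, size_t len2)
{
    SHA256_CTX ctx;

    sha256_init(&ctx);                 /* reset h[] to the SHA-256 IV       */
    sha256_update(&ctx, part1, len1);  /* buffers and/or compresses blocks  */
    sha256_update(&ctx, part2, len2);
    sha256_final(md, &ctx);            /* pads, compresses, emits 32 bytes  */
}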
+ */ +typedef struct { + unsigned int h[8]; + unsigned long long N; + unsigned char buf[64]; + size_t off; +} SHA256_CTX; + + +static void sha256_init_h(unsigned int h[8]) +{ + h[0] = 0x6a09e667U; + h[1] = 0xbb67ae85U; + h[2] = 0x3c6ef372U; + h[3] = 0xa54ff53aU; + h[4] = 0x510e527fU; + h[5] = 0x9b05688cU; + h[6] = 0x1f83d9abU; + h[7] = 0x5be0cd19U; +} + +static void sha256_init(SHA256_CTX *ctx) +{ + sha256_init_h(ctx->h); + ctx->N = 0; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) +{ + size_t n; + const unsigned char *inp = _inp; + + ctx->N += len; + + if ((len != 0) & ((n = ctx->off) != 0)) { + size_t rem = sizeof(ctx->buf) - n; + + if (rem > len) { + sha256_bcopy(ctx->buf + n, inp, len); + ctx->off += len; + return; + } else { + sha256_bcopy(ctx->buf + n, inp, rem); + inp += rem; + len -= rem; + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; + } + } + + n = len / sizeof(ctx->buf); + if (n > 0) { + sha256_block_data_order(ctx->h, inp, n); + n *= sizeof(ctx->buf); + inp += n; + len -= n; + } + + if (len) + sha256_bcopy(ctx->buf, inp, ctx->off = len); +} + +#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ + (ptr)[1] = (unsigned char)((val)>>16), \ + (ptr)[2] = (unsigned char)((val)>>8), \ + (ptr)[3] = (unsigned char)(val)) + +#if 1 +void sha256_emit(unsigned char md[32], const unsigned int h[8]); +#else +static void sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + unsigned int h_i; + + h_i = h[0]; __TOBE32(md + 0, h_i); + h_i = h[1]; __TOBE32(md + 4, h_i); + h_i = h[2]; __TOBE32(md + 8, h_i); + h_i = h[3]; __TOBE32(md + 12, h_i); + h_i = h[4]; __TOBE32(md + 16, h_i); + h_i = h[5]; __TOBE32(md + 20, h_i); + h_i = h[6]; __TOBE32(md + 24, h_i); + h_i = h[7]; __TOBE32(md + 28, h_i); +} +#endif + +static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) +{ + unsigned long long bits = ctx->N * 8; + size_t n = ctx->off; + unsigned char *tail; + + ctx->buf[n++] = 0x80; + + if (n > (sizeof(ctx->buf) - 8)) { + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + } + + tail = ctx->buf + sizeof(ctx->buf) - 8; + __TOBE32(tail, (unsigned int)(bits >> 32)); + __TOBE32(tail + 4, (unsigned int)bits); + sha256_block_data_order(ctx->h, ctx->buf, 1); + sha256_emit(md, ctx->h); +} + +#undef __TOBE32 +#endif diff --git a/crypto/blst_src/sqrt-addchain.h b/crypto/blst_src/sqrt-addchain.h new file mode 100644 index 00000000000..4e7f0beb6b1 --- /dev/null +++ b/crypto/blst_src/sqrt-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which + * yields reciprocal of sqrt(x), which is used in simplified Shallue- + * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt + * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) + * as 'x*ret^2==1'). 
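[Editorial note, standard number theory, not part of the diff] Why exponentiation by (p-3)/4 yields a reciprocal square root when p is 3 mod 4, as BLS12-381's p is: write r = x^((p-3)/4) for nonzero x. Then

    x*r^2 = x^((p-1)/2), which equals 1 exactly when x is a square (Euler's criterion), and
    x*r   = x^((p+1)/4), whose square is x^((p+1)/2) = x * x^((p-1)/2) = x whenever x is a square,

so for quadratic residues r is 1/sqrt(x), x*r is a square root, and x*r^2 == 1 is the is_square test quoted above.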
+ * + * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 458 (16) <<< + * # Bos-Coster (win=5) : 460 (28) + * # Bos-Coster (win=6) : 461 (33) + * # Bos-Coster (win=7) : 460 (28) + * # Bos-Coster (win=3) : 462 ( 9) + * # Bos-Coster (win=8) : 466 (34) + * # Bos-Coster (win=9) : 464 (31) + * # Yacobi : 478 (31) + * # Bos-Coster (win=10) : 473 (30) + * # Bos-Coster (win=2) : 486 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) + */ + +#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ +sqr(t[0], t[13]); /* 1: 2 */\ +mul(t[8], t[0], t[13]); /* 2: 3 */\ +sqr(t[4], t[0]); /* 3: 4 */\ +mul(t[1], t[8], t[0]); /* 4: 5 */\ +mul(t[6], t[4], t[8]); /* 5: 7 */\ +mul(t[9], t[1], t[4]); /* 6: 9 */\ +mul(t[12], t[6], t[4]); /* 7: b */\ +mul(t[3], t[9], t[4]); /* 8: d */\ +mul(t[7], t[12], t[4]); /* 9: f */\ +mul(t[15], t[3], t[4]); /* 10: 11 */\ +mul(t[10], t[7], t[4]); /* 11: 13 */\ +mul(t[2], t[15], t[4]); /* 12: 15 */\ +mul(t[11], t[10], t[4]); /* 13: 17 */\ +sqr(t[0], t[3]); /* 14: 1a */\ +mul(t[14], t[11], t[4]); /* 15: 1b */\ +mul(t[5], t[0], t[8]); /* 16: 1d */\ +mul(t[4], t[0], t[1]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[8]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +} while(0) diff --git a/crypto/blst_src/sqrt.c b/crypto/blst_src/sqrt.c new file mode 100644 index 00000000000..cf149fd1124 --- /dev/null +++ b/crypto/blst_src/sqrt.c @@ -0,0 +1,261 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + static const byte BLS_12_381_P_minus_3_div_4[] = { + TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff), + TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af), + TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6) + }; + + exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); +} +#else +# if 1 +/* + * "383"-bit variant omits full reductions at the ends of squarings, + * which results in up to ~15% improvement. [One can improve further + * by omitting full reductions even after multiplications and + * performing final reduction at the very end of the chain.] 
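The chain above is a fixed addition-chain exponentiation: it evaluates inp^((p-3)/4) for the BLS12-381 prime p (the same exponent the size-optimized recip_sqrt_fp_3mod4 above passes to exp_mont_384 as a 379-bit byte string), and each sqr_n_mul(t[0], t[0], n, t[k]) step squares the accumulator n times and then multiplies once by a precomputed table entry (t[1] through t[7] in the visible tail). As a rough, hypothetical sketch of that pattern only, over a toy 64-bit modulus rather than the blst field:

#include <stdint.h>
#include <stdio.h>

/* a 64-bit prime, 2^64 - 2^32 + 1, small enough for __int128 intermediates */
static const uint64_t P = 0xffffffff00000001ULL;

static uint64_t mul_mod(uint64_t a, uint64_t b)
{   return (uint64_t)((unsigned __int128)a * b % P);   }

/* square |a| n times, then multiply once by |b| -- the shape of sqr_n_mul() */
static uint64_t toy_sqr_n_mul(uint64_t a, size_t n, uint64_t b)
{
    while (n--)
        a = mul_mod(a, a);
    return mul_mod(a, b);
}

/* straightforward square-and-multiply, used only as a cross-check */
static uint64_t pow_mod(uint64_t a, uint64_t e)
{
    uint64_t r = 1;
    for (; e != 0; e >>= 1, a = mul_mod(a, a))
        if (e & 1)
            r = mul_mod(r, a);
    return r;
}

int main(void)
{
    uint64_t x = 5;
    uint64_t t = toy_sqr_n_mul(x, 1, x);    /* x^2  * x = x^3  */
    t = toy_sqr_n_mul(t, 2, x);             /* x^12 * x = x^13 */

    printf("chain %016llx, direct x^13 %016llx\n",
           (unsigned long long)t, (unsigned long long)pow_mod(x, 13));
    return 0;
}

The real chain differs only in scale: the operands are 384-bit Montgomery-form field elements, the squarings use the "383-bit" lazy-reduction variant described in the comment above, and the exponent bits are baked into the fixed sequence of (n, t[k]) pairs instead of being scanned at run time.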
+ */ +static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } +# else +static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ + while(count--) { + sqr_fp(out, a); + a = out; + } + mul_fp(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "sqrt-addchain.h" +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIP_SQRT_MOD_BLS12_381_P + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t1, t0, inp); + sqr_fp(t1, t1); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +static bool_t sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t0, t0, inp); + sqr_fp(t1, t0); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +int blst_fp_sqrt(vec384 out, const vec384 inp) +{ return (int)sqrt_fp(out, inp); } + +int blst_fp_is_square(const vec384 inp) +{ + return (int)ct_is_square_mod_384(inp, BLS12_381_P); +} + +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp) +{ + static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; + static const vec384x sqrt_sqrt_minus_1 = { + /* + * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", + * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, + * but it pivots into "complex" plane nevertheless... + */ + { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + static const vec384x sqrt_minus_sqrt_minus_1 = { + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + vec384x coeff, t0, t1; + bool_t is_sqrt, flag; + + /* + * Instead of multiple trial squarings we can perform just one + * and see if the result is "rotated by multiple of 90°" in + * relation to |inp|, and "rotate" |ret| accordingly. + */ + sqr_fp2(t0, sqrt); + /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ + + /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ + sub_fp2(t1, t0, inp); + is_sqrt = vec_is_zero(t1, sizeof(t1)); + vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); + + /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ + add_fp2(t1, t0, inp); + vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* 2ab - (a^2-b^2)*i == |inp| ? 
"rotate |ret| by 135°" */ + sub_fp(t1[0], t0[0], inp[1]); + add_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ + add_fp(t1[0], t0[0], inp[1]); + sub_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* actual "rotation" */ + mul_fp2(out, ret, coeff); + + return is_sqrt; +} + +/* + * |inp| = a + b*i + */ +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, + const vec384x magic_ZZZ) +{ + vec384 aa, bb, cc; + vec384x inp_; + bool_t is_sqrt; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + is_sqrt = recip_sqrt_fp(cc, aa); /* 1/sqrt(a²+b²) */ + + /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ... */ + mul_fp2(inp_, inp, recip_ZZZ); + /* ... and adjust |aa| and |cc| accordingly */ + { + vec384 za, zc; + + mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²) */ + mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4) */ + vec_select(aa, aa, za, sizeof(aa), is_sqrt); + vec_select(cc, cc, zc, sizeof(cc), is_sqrt); + } + vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt); + + mul_fp(aa, aa, cc); /* sqrt(a²+b²) */ + + sub_fp(bb, inp_[0], aa); + add_fp(aa, inp_[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(out[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(out[1], inp_[1]); + mul_fp(out[1], out[1], out[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(out[0], out[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* bound to succeed */ + (void)sqrt_align_fp2(out, out, out, inp_); + + mul_fp(out[0], out[0], cc); /* inverse the result */ + mul_fp(out[1], out[1], cc); + neg_fp(out[1], out[1]); + + return is_sqrt; +} + +static bool_t sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret; + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + /* don't pay attention to return value, final "align" will tell... */ + (void)sqrt_fp(aa, aa); /* sqrt(a²+b²) */ + + sub_fp(bb, inp[0], aa); + add_fp(aa, inp[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(ret[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(ret[1], inp[1]); + mul_fp(ret[1], ret[1], ret[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(ret[0], ret[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* + * Now see if |ret| is or can be made sqrt(|inp|)... + */ + + return sqrt_align_fp2(out, ret, ret, inp); +} + +int blst_fp2_sqrt(vec384x out, const vec384x inp) +{ return (int)sqrt_fp2(out, inp); } + +int blst_fp2_is_square(const vec384x inp) +{ + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + return (int)ct_is_square_mod_384(aa, BLS12_381_P); +} diff --git a/crypto/blst_src/vect.c b/crypto/blst_src/vect.c new file mode 100644 index 00000000000..1834a48fadd --- /dev/null +++ b/crypto/blst_src/vect.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +#ifdef __BLST_NO_ASM__ +# include "no_asm.h" +#endif + +/* + * Following are some reference C implementations to assist new + * assembly modules development, as starting-point stand-ins and for + * cross-checking. In order to "polyfil" specific subroutine redefine + * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. + */ + +#ifdef lshift_mod_384 +inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n, + const vec384 mod) +{ + while(n--) + add_mod_384(ret, a, a, mod), a = ret; +} +#endif + +#ifdef mul_by_8_mod_384 +inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ lshift_mod_384(ret, a, 3, mod); } +#endif + +#ifdef mul_by_3_mod_384 +inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a, a, mod); + add_mod_384(ret, t, a, mod); +} +#endif + +#ifdef mul_by_3_mod_384x +inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_3_mod_384(ret[0], a[0], mod); + mul_by_3_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_8_mod_384x +inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_8_mod_384(ret[0], a[0], mod); + mul_by_8_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_1_plus_i_mod_384x +inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, + const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a[0], a[1], mod); + sub_mod_384(ret[0], a[0], a[1], mod); + vec_copy(ret[1], t, sizeof(t)); +} +#endif + +#ifdef add_mod_384x +inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + add_mod_384(ret[0], a[0], b[0], mod); + add_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef sub_mod_384x +inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + sub_mod_384(ret[0], a[0], b[0], mod); + sub_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef lshift_mod_384x +inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, + const vec384 mod) +{ + lshift_mod_384(ret[0], a[0], n, mod); + lshift_mod_384(ret[1], a[1], n, mod); +} +#endif + +#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod, limb_t n0) +{ + vec768 t0, t1, t2; + vec384 aa, bb; + + mul_384(t0, a[0], b[0]); + mul_384(t1, a[1], b[1]); + + add_mod_384(aa, a[0], a[1], mod); + add_mod_384(bb, b[0], b[1], mod); + mul_384(t2, aa, bb); + sub_mod_384x384(t2, t2, t0, mod); + sub_mod_384x384(t2, t2, t1, mod); + + sub_mod_384x384(t0, t0, t1, mod); + + redc_mont_384(ret[0], t0, mod, n0); + redc_mont_384(ret[1], t2, mod, n0); +} +#endif + +#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) +{ + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], mod); + sub_mod_384(t1, a[0], a[1], mod); + + mul_mont_384(ret[1], a[0], a[1], mod, n0); + add_mod_384(ret[1], ret[1], ret[1], mod); + + mul_mont_384(ret[0], t0, t1, mod, n0); +} +#endif + +limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi); +limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); +limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); + +/* + * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place. 
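For context on the hard-coded divisors used by div_by_zz() and div_by_z() just below: the single-limb divisor 0xd201000000010000 is the absolute value of the BLS12-381 curve parameter z, and the two-limb zz[] divisor is simply z squared, i.e. 0xac45a4010001a402_0000000100000000 split into little-endian limbs. A throwaway standalone check of that constant, purely illustrative and not part of blst:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t z = 0xd201000000010000ULL;              /* |z| for BLS12-381 */
    unsigned __int128 zz = (unsigned __int128)z * z;

    /* expect hi = ac45a4010001a402, lo = 0000000100000000 */
    printf("z^2 = %016llx %016llx\n",
           (unsigned long long)(zz >> 64), (unsigned long long)zz);
    return 0;
}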
+ */ +static void div_by_zz(limb_t val[]) +{ + static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000), + TO_LIMB_T(0xac45a4010001a402) }; + size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]); + limb_t d_lo, d_hi; + + d_lo = zz[zz_len - 2]; + d_hi = zz[zz_len - 1]; + for (loop = zz_len, zz_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi); + (void)quot_rem_128(val + loop, zz, q); + } + /* remainder is in low half of val[], quotient is in high */ +} + +/* + * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place. + */ +static void div_by_z(limb_t val[]) +{ + static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) }; + size_t loop, z_len = sizeof(z)/sizeof(z[0]); + limb_t d_lo, d_hi; + + d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2]; + d_hi = z[z_len - 1]; + for (loop = z_len, z_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi); + (void)quot_rem_64(val + loop, z, q); + } + /* remainder is in low half of val[], quotient is in high */ +} diff --git a/crypto/blst_src/vect.h b/crypto/blst_src/vect.h new file mode 100644 index 00000000000..3211c8628cf --- /dev/null +++ b/crypto/blst_src/vect.h @@ -0,0 +1,418 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_VECT_H__ +#define __BLS12_381_ASM_VECT_H__ + +#include + +#if defined(__x86_64__) || defined(__aarch64__) +/* These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. */ +typedef unsigned long long limb_t; +# define LIMB_T_BITS 64 + +#elif defined(_WIN64) /* Win64 is P64 */ +typedef unsigned __int64 limb_t; +# define LIMB_T_BITS 64 + +#elif defined(__BLST_NO_ASM__) || defined(__wasm64__) +typedef unsigned int limb_t; +# define LIMB_T_BITS 32 +# ifndef __BLST_NO_ASM__ +# define __BLST_NO_ASM__ +# endif + +#else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ +typedef unsigned long limb_t; +# ifdef _LP64 +# define LIMB_T_BITS 64 +# else +# define LIMB_T_BITS 32 +# define __BLST_NO_ASM__ +# endif +#endif + +/* + * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor + * knows nothing about sizeof(anything)... + */ +#if LIMB_T_BITS == 64 +# define TO_LIMB_T(limb64) limb64 +#else +# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) +#endif + +#define NLIMBS(bits) (bits/LIMB_T_BITS) + +typedef limb_t vec256[NLIMBS(256)]; +typedef limb_t vec512[NLIMBS(512)]; +typedef limb_t vec384[NLIMBS(384)]; +typedef limb_t vec768[NLIMBS(768)]; +typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ + +typedef unsigned char byte; +#define TO_BYTES(limb64) (byte)limb64,(byte)(limb64>>8),\ + (byte)(limb64>>16),(byte)(limb64>>24),\ + (byte)(limb64>>32),(byte)(limb64>>40),\ + (byte)(limb64>>48),(byte)(limb64>>56) +typedef byte pow256[256/8]; + +/* + * Internal Boolean type, Bolean by value, hence safe to cast to or + * reinterpret as 'bool'. + */ +typedef limb_t bool_t; + +/* + * Assembly subroutines... + */ +#if defined(__ADX__) /* e.g. 
-march=broadwell */ && !defined(__BLST_PORTABLE__)\ + && !defined(__BLST_NO_ASM__) +# define mul_mont_sparse_256 mulx_mont_sparse_256 +# define sqr_mont_sparse_256 sqrx_mont_sparse_256 +# define from_mont_256 fromx_mont_256 +# define redc_mont_256 redcx_mont_256 +# define mul_mont_384 mulx_mont_384 +# define sqr_mont_384 sqrx_mont_384 +# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 +# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 +# define mul_384 mulx_384 +# define sqr_384 sqrx_384 +# define redc_mont_384 redcx_mont_384 +# define from_mont_384 fromx_mont_384 +# define sgn0_pty_mont_384 sgn0x_pty_mont_384 +# define sgn0_pty_mont_384x sgn0x_pty_mont_384x +# define ct_inverse_mod_383 ctx_inverse_mod_383 +#elif defined(__BLST_NO_ASM__) +# define ct_inverse_mod_383 ct_inverse_mod_384 +#endif + +void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, + const vec256 p, limb_t n0); +void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); +void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); +void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); + +void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); +void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p); +void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p, + const vec256 one); +limb_t check_mod_256(const pow256 a, const vec256 p); +limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); +limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); + +void vec_prefetch(const void *ptr, size_t len); + +void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, + const vec384 p, limb_t n0); +void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); + +void mul_384(vec768 ret, const vec384 a, const vec384 b); +void sqr_384(vec768 ret, const vec384 a); +void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); +void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); +limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); + +void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); +void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p); +void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p); +void ct_inverse_mod_383(vec768 ret, const vec384 inp, 
const vec384 mod, + const vec384 modx); +void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, + const vec256 modx); +bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); + +#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_384x mulx_mont_384x +# define sqr_mont_384x sqrx_mont_384x +# define sqr_mont_382x sqrx_mont_382x +# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x +# define mul_382x mulx_382x +# define sqr_382x sqrx_382x +#endif + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0); +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count, + const vec384 p, limb_t n0, const vec384x b); +void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); +void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); + +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); +void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); + +/* + * C subroutines + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void div_by_zz(limb_t val[]); +static void div_by_z(limb_t val[]); + +#ifdef __UINTPTR_TYPE__ +typedef __UINTPTR_TYPE__ uptr_t; +#else +typedef const void *uptr_t; +#endif + +#if !defined(restrict) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define restrict __restrict__ +# elif defined(_MSC_VER) +# define restrict __restrict +# else +# define restrict +# endif +# endif +#endif + +#if !defined(inline) && !defined(__cplusplus) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define inline __inline__ +# elif defined(_MSC_VER) +# define inline __inline +# else +# define inline +# endif +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define launder(var) asm volatile("" : "+r"(var)) +#else +# define launder(var) +#endif + +static inline bool_t is_bit_set(const byte *v, size_t i) +{ + bool_t ret = (v[i/8] >> (i%8)) & 1; + launder(ret); + return ret; +} + +static inline bool_t byte_is_zero(unsigned char c) +{ + limb_t ret = ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) +{ + unsigned char acc; + size_t i; + + for (acc = 0, i = 0; i < num; i++) + acc |= a[i]; + + return byte_is_zero(acc); +} + +static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, + bool_t cbit) +{ + limb_t ai, *ap = (limb_t *)a; + limb_t bi, *bp = (limb_t *)b; + limb_t xorm, mask = (limb_t)0 - cbit; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; + ap[i] = ai ^ xorm; + 
bp[i] = bi ^ xorm; + } +} + +/* ret = bit ? a : b */ +void vec_select_32(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a); +static inline void vec_select(void *ret, const void *a, const void *b, + size_t num, bool_t sel_a) +{ + launder(sel_a); +#ifndef __BLST_NO_ASM__ + if (num == 32) vec_select_32(ret, a, b, sel_a); + else if (num == 48) vec_select_48(ret, a, b, sel_a); + else if (num == 96) vec_select_96(ret, a, b, sel_a); + else if (num == 144) vec_select_144(ret, a, b, sel_a); + else if (num == 192) vec_select_192(ret, a, b, sel_a); + else if (num == 288) vec_select_288(ret, a, b, sel_a); +#else + if (0) ; +#endif + else { + limb_t bi; + volatile limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t xorm, mask = (limb_t)0 - sel_a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = (ap[i] ^ (bi = bp[i])) & mask; + rp[i] = bi ^ xorm; + } + } +} + +static inline bool_t is_zero(limb_t l) +{ + limb_t ret = (~l & (l - 1)) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t vec_is_zero(const void *a, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_zero_16x(const void *a, size_t num); + if ((num & 15) == 0) + return vec_is_zero_16x(a, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i]; + + return is_zero(acc); +} + +static inline bool_t vec_is_equal(const void *a, const void *b, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_equal_16x(const void *a, const void *b, size_t num); + if ((num & 15) == 0) + return vec_is_equal_16x(a, b, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i] ^ bp[i]; + + return is_zero(acc); +} + +static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag, + const vec384 p) +{ + cneg_mod_384(ret[0], a[0], flag, p); + cneg_mod_384(ret[1], a[1], flag, p); +} + +static inline void vec_copy(void *restrict ret, const void *a, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i]; +} + +static inline void vec_zero(void *ret, size_t num) +{ + volatile limb_t *rp = (volatile limb_t *)ret; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = 0; + +#if defined(__GNUC__) || defined(__clang__) + asm volatile("" : : "r"(ret) : "memory"); +#endif +} + +/* + * Some compilers get arguably overzealous(*) when passing pointer to + * multi-dimensional array [such as vec384x] as 'const' argument. + * General direction seems to be to legitimize such constification, + * so it's argued that suppressing the warning is appropriate. 
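vec_select(), vec_cswap(), is_zero() and byte_is_zero() above all build on the same branch-free idiom: stretch a 0/1 flag into an all-zeros or all-ones mask and combine words with XOR and AND, so that neither the executed instructions nor the memory access pattern depends on secret data (the launder() asm barrier is there to discourage compilers from optimizing the mask back into a branch). A minimal, hypothetical stand-alone illustration of the idiom, not blst code:

#include <stdint.h>
#include <stdio.h>

/* return sel_a ? a : b without branching on sel_a (sel_a must be 0 or 1) */
static uint64_t ct_select(uint64_t a, uint64_t b, uint64_t sel_a)
{
    uint64_t mask = (uint64_t)0 - sel_a;        /* 0x00...0 or 0xff...f */
    return b ^ ((a ^ b) & mask);
}

/* return 1 if l == 0 and 0 otherwise, without branching */
static uint64_t ct_is_zero(uint64_t l)
{   return (~l & (l - 1)) >> 63;   }

int main(void)
{
    printf("%llu %llu\n", (unsigned long long)ct_select(7, 9, 1),    /* 7 */
                          (unsigned long long)ct_select(7, 9, 0));   /* 9 */
    printf("%llu %llu\n", (unsigned long long)ct_is_zero(0),         /* 1 */
                          (unsigned long long)ct_is_zero(42));       /* 0 */
    return 0;
}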
+ * + * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm + */ +#if defined(__INTEL_COMPILER) +# pragma warning(disable:167) +# pragma warning(disable:556) +#elif defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic ignored "-Wpedantic" +#elif defined(_MSC_VER) +# pragma warning(disable: 4127 4189) +#endif + +#if !defined(__wasm__) +# include +#endif + +#if defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca(s) +# endif +#elif defined(__sun) +# include +#elif defined(_WIN32) +# include +# ifndef alloca +# define alloca(s) _alloca(s) +# endif +#endif + +#endif /* __BLS12_381_ASM_VECT_H__ */ From 635f96099add5121be56768898ff8d5d8daf36ee Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 18 Jan 2023 17:19:52 -0600 Subject: [PATCH 002/200] add blst header files and README --- crypto/blst_src/README.md | 10 + crypto/blst_src/blst.h | 483 +++++++++++++++++++++++++++++++++++++ crypto/blst_src/blst_aux.h | 102 ++++++++ 3 files changed, 595 insertions(+) create mode 100644 crypto/blst_src/README.md create mode 100644 crypto/blst_src/blst.h create mode 100644 crypto/blst_src/blst_aux.h diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md new file mode 100644 index 00000000000..ff835dbc640 --- /dev/null +++ b/crypto/blst_src/README.md @@ -0,0 +1,10 @@ +WIP + +Files copied from BLST repo https://github.com/supranational/blst. +TODO: License and copyright mention + +content: +- all /src files (C source files) +- all /build (assembly generated files) +- /bindings/blst.h (headers of external functions) +- /bindings/blst_aux.h (headers of external aux functions) \ No newline at end of file diff --git a/crypto/blst_src/blst.h b/crypto/blst_src/blst.h new file mode 100644 index 00000000000..24213ded2c5 --- /dev/null +++ b/crypto/blst_src/blst.h @@ -0,0 +1,483 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); +void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specifc Fr operations. 
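As a small usage note for the scalar conversions declared above: a 32-byte big-endian encoding is typically imported with blst_scalar_from_bendian and then validated with blst_scalar_fr_check, which reports whether the value is a canonical scalar, i.e. already reduced modulo the group order. A hypothetical sketch against this header, with error handling left to the caller:

#include "blst.h"

/* Import a 32-byte big-endian scalar; true iff it is a canonical Fr element. */
static bool scalar_from_be32_checked(blst_scalar *out, const byte be32[32])
{
    blst_scalar_from_bendian(out, be32);
    return blst_scalar_fr_check(out);
}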
+ */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specifc Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specifc Fp2 operations. + */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specifc Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! 
*/ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(); +#endif // SWIG + +/* + * BLS12-381-specifc point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(); + +/* + * Multi-scalar multiplications and other multi-point operations. 
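To make the point interface above concrete: blst_p1_mult takes the scalar as a little-endian byte string together with an explicit bit count, so multiplying the G1 generator by a blst_scalar (whose b[] member holds the scalar bytes in little-endian order) is usually written along the following lines. This is a hypothetical sketch rather than an exported blst helper; 255 bits is enough to cover the BLS12-381 group order:

#include "blst.h"

/* out = SK * G1 generator, returned in affine coordinates */
static void g1_scalar_base_mult(blst_p1_affine *out, const blst_scalar *SK)
{
    blst_p1 tmp;

    blst_p1_mult(&tmp, blst_p1_generator(), SK->b, 255);
    blst_p1_to_affine(out, &tmp);
}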
+ */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. + */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. 
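A usage note on the Pippenger entry points above: the intended pattern is to query the scratch size for the batch, allocate it, and hand over parallel arrays of point and scalar pointers. The sketch below is hypothetical, assumes the *_scratch_sizeof helpers return a size in bytes, and omits allocation failure handling:

#include <stdlib.h>
#include "blst.h"

/* ret = sum_i scalars[i] * points[i] over G1 */
static void g1_msm(blst_p1 *ret, const blst_p1_affine *const points[],
                   const byte *const scalars[], size_t npoints, size_t nbits)
{
    limb_t *scratch = malloc(blst_p1s_mult_pippenger_scratch_sizeof(npoints));

    blst_p1s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch);
    free(scratch);
}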
+ */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. + */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + 
size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. + */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif diff --git a/crypto/blst_src/blst_aux.h b/crypto/blst_src/blst_aux.h new file mode 100644 index 00000000000..6d444fc1729 --- /dev/null +++ b/crypto/blst_src/blst_aux.h @@ -0,0 +1,102 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. 
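Tying the declarations above together, the "minimal-pubkey-size" flow (public keys in G1, signatures in G2) goes roughly: derive a secret scalar from input key material, publish the 48-byte compressed G1 public key, hash the message to G2, sign, and check with the one-shot CoreVerify entry point. The sketch below is hypothetical glue code against this header, with the usual hash-to-G2 ciphersuite DST used only as a placeholder and error handling omitted; IKM is expected to be at least 32 bytes:

#include "blst.h"

static const byte DST[] = "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_";

static BLST_ERROR sign_and_verify(const byte *ikm, size_t ikm_len,
                                  const byte *msg, size_t msg_len)
{
    blst_scalar sk;
    blst_p1 pk;         blst_p1_affine pk_aff;
    blst_p2 hash, sig;  blst_p2_affine sig_aff;
    byte pk_comp[48];

    blst_keygen(&sk, ikm, ikm_len, NULL, 0);       /* IKM -> secret scalar     */
    blst_sk_to_pk_in_g1(&pk, &sk);                 /* pk = sk * G1 generator   */
    blst_p1_to_affine(&pk_aff, &pk);
    blst_p1_affine_compress(pk_comp, &pk_aff);     /* 48-byte public key       */

    blst_hash_to_g2(&hash, msg, msg_len, DST, sizeof(DST) - 1, NULL, 0);
    blst_sign_pk_in_g1(&sig, &hash, &sk);          /* sig = sk * H(msg)        */
    blst_p2_to_affine(&sig_aff, &sig);

    return blst_core_verify_pk_in_g1(&pk_aff, &sig_aff, 1 /* hash, not encode */,
                                     msg, msg_len, DST, sizeof(DST) - 1,
                                     NULL, 0);
}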
+ */ + +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_| can be NULL. + */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +typedef struct {} blst_uniq; + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); +void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); + +void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_derive_master_eip2333(blst_scalar *out_SK, + const byte *IKM, size_t IKM_len); +void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, + uint32_t child_index); + +void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); +void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); +void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); + +size_t blst_p1_sizeof(); +size_t blst_p1_affine_sizeof(); +size_t blst_p2_sizeof(); +size_t blst_p2_affine_sizeof(); +size_t blst_fp12_sizeof(); +#endif From abe0fe10ca53e3880816e77b4895296a38b64813 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 18 Jan 2023 17:20:57 -0600 Subject: 
[PATCH 003/200] cgo directives to compile blst files --- crypto/bls.go | 3 +++ crypto/bls12381_utils.go | 3 +-- crypto/bls12381_utils.h | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 1d725ebab63..9e64d283c2d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -33,7 +33,10 @@ package crypto // #cgo CFLAGS: -g -Wall -std=c99 // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls_include.h" +// #include "blst.h" import "C" import ( diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 4138d35a599..f9a94beb1ee 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,10 +7,9 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low +// #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls12381_utils.h" -// #include "bls_include.h" import "C" import ( "errors" diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index c7e3587f664..de2efe9cb53 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -8,8 +8,7 @@ #define _REL_MISC_INCLUDE_H #include "relic.h" - -typedef uint8_t byte; +#include "blst.h" #define VALID RLC_OK #define INVALID RLC_ERR From 59be5248277a2014b4ea7a4c1fa86f5eed5299ad Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 23 Jan 2023 20:11:41 -0600 Subject: [PATCH 004/200] update README and clean up C directives --- crypto/bls.go | 3 --- crypto/bls12381_utils.go | 3 +++ crypto/blst_src/README.md | 14 ++++++++++---- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 9e64d283c2d..1d725ebab63 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -33,10 +33,7 @@ package crypto // #cgo CFLAGS: -g -Wall -std=c99 // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s -// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx -// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls_include.h" -// #include "blst.h" import "C" import ( diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index f9a94beb1ee..7e327571e47 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -9,7 +9,10 @@ package crypto // #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" +// #include "blst.h" import "C" import ( "errors" diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index ff835dbc640..c5867bcd742 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -1,9 +1,15 @@ -WIP +All files in this folder contain source files copied from the BLST repo 
https://github.com/supranational/blst +specifically from the commit <92c12ac58095de04e776cec5ef5ce5bdf242b693>. -Files copied from BLST repo https://github.com/supranational/blst. -TODO: License and copyright mention + Copyright Supranational LLC + Licensed under the Apache License, Version 2.0, see LICENSE for details. + SPDX-License-Identifier: Apache-2.0 -content: +While BLST exports multiple functions and tools, the implementation in Flow crypto requires access to low level functions. Some of these tools are not exported by BLST, others would need to be used without paying for the cgo cost, and therefore without using the Go bindings in BLST. + + +The folder contains: +- BLST LICENSE file - all /src files (C source files) - all /build (assembly generated files) - /bindings/blst.h (headers of external functions) From 56b4b5608f17a02c6b68c098f1fa05d4572e1649 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 25 Jan 2023 12:48:49 -0800 Subject: [PATCH 005/200] remove non code files --- crypto/blst_src/README.md | 2 +- crypto/blst_src/blst_t.hpp | 538 ------------------------------- crypto/blst_src/build/refresh.sh | 49 --- 3 files changed, 1 insertion(+), 588 deletions(-) delete mode 100644 crypto/blst_src/blst_t.hpp delete mode 100755 crypto/blst_src/build/refresh.sh diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index c5867bcd742..12bc7b863ca 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -10,7 +10,7 @@ While BLST exports multiple functions and tools, the implementation in Flow cryp The folder contains: - BLST LICENSE file -- all /src files (C source files) +- all /src/*.c and /src/*.h files (C source files) - all /build (assembly generated files) - /bindings/blst.h (headers of external functions) - /bindings/blst_aux.h (headers of external aux functions) \ No newline at end of file diff --git a/crypto/blst_src/blst_t.hpp b/crypto/blst_src/blst_t.hpp deleted file mode 100644 index 1b150da30ce..00000000000 --- a/crypto/blst_src/blst_t.hpp +++ /dev/null @@ -1,538 +0,0 @@ -// Copyright Supranational LLC -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef __BLST_T_HPP__ -#define __BLST_T_HPP__ - -/* - * These templates, blst_384_t and blst_256_t, allow to instantiate slim - * C++ shims to blst assembly with arbitrary moduli. Well, not literally - * arbitrary, as there are limitations. Most notably blst_384_t can not - * actually accommodate 384-bit moduli, only 383 and narrower. This is - * because of ct_inverse_mod_383's limitation. Though if you abstain - * from the reciprocal() method, even 384-bit modulus would work. As for - * blst_256_t, modulus has to be not larger than 2^256-2^192-1. 
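(Editorial aside, not part of the patch: the comment above explains what these template shims are for, but the template parameter list itself is truncated in this excerpt. The sketch below is a minimal, hypothetical instantiation of the 256-bit shim; it assumes the template is parameterized by the modulus, the Montgomery constant M0, RR and ONE, in that order, and that constants named r_mod, r_M0, r_RR and r_ONE are defined elsewhere with external linkage. None of these names or values come from the patch.)

    // Hypothetical usage sketch: instantiate the 256-bit shim for some odd
    // modulus m <= 2^256 - 2^192 - 1 and use the overloaded operators.
    #include "blst_t.hpp"                      // pulls in vect.h (limb_t, vec256)

    extern const vec256 r_mod, r_RR, r_ONE;    // assumed: m, R^2 mod m, R mod m
    static const limb_t r_M0 = 0;              // placeholder for -(m^-1) mod 2^64

    typedef blst_256_t<r_mod, r_M0, r_RR, r_ONE> fr_t;

    static fr_t demo(const fr_t& a, const fr_t& b)
    {
        fr_t c = a * b + a;                    // mul_mont_sparse_256 / add_mod_256
        return 1 / c;                          // reciprocal() via ct_inverse_mod_256
    }
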
- */ - -#ifdef __GNUC__ -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wunused-function" -#endif - -extern "C" { -#include "vect.h" -} -#include "bytes.h" - -#undef launder // avoid conflict with C++ >=17 - -#ifdef __GNUC__ -# pragma GCC diagnostic pop -#endif - -static inline void vec_left_align(limb_t *out, const limb_t *inp, size_t n) -{ - const unsigned int nbits = sizeof(inp[0])*8; - unsigned int align = 0; - limb_t top = inp[n-1]; - - if (top) { - while ((top >> (nbits-1)) == 0) - top <<= 1, align++; - } - if (align) { - while (--n) { - limb_t next = inp[n-1]; - out[n] = top | next >> (nbits-align); - top = next << align; - } - out[0] = top; - } else { - for (size_t i = 0; i < n-1; i++) - out[i] = inp[i]; - out[n-1] = top; - } -} - -constexpr static inline size_t vec_nbits(const limb_t *inp, size_t n) -{ - const unsigned int nbits = sizeof(inp[0])*8; - size_t align = 0; - limb_t top = inp[n-1]; - - while ((top >> (nbits-1)) == 0) - top <<= 1, align++; - - return n*nbits - align; -} - -template -class blst_384_t { -private: - vec384 val; - - inline operator const limb_t*() const { return val; } - inline operator limb_t*() { return val; } - inline limb_t& operator[](size_t i) { return val[i]; } - inline const limb_t& operator[](size_t i) const { return val[i]; } - -public: - static const size_t n = sizeof(vec384)/sizeof(limb_t); - static const size_t nbits = vec_nbits(MOD, n); - typedef byte pow_t[384/8]; - - inline blst_384_t() {} - inline blst_384_t(const vec384 p, bool align = false) - { - if (align) - vec_left_align(val, p, n); - else - vec_copy(val, p, sizeof(val)); - } - inline blst_384_t(uint64_t a) - { - vec_zero(val, sizeof(val)); - val[0] = a; - if (a) to(); - } - inline blst_384_t(int a) : blst_384_t((uint64_t)a) {} - - inline void to_scalar(pow_t& scalar) const - { - const union { - long one; - char little; - } is_endian = { 1 }; - - if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { - from_mont_384((limb_t *)scalar, val, MOD, M0); - } else { - vec384 out; - from_mont_384(out, val, MOD, M0); - le_bytes_from_limbs(scalar, out, sizeof(pow_t)); - vec_zero(out, sizeof(out)); - } - } - - static inline const blst_384_t& one() - { return *reinterpret_cast(ONE); } - - inline blst_384_t& to() - { mul_mont_384(val, RR, val, MOD, M0); return *this; } - inline blst_384_t& from() - { from_mont_384(val, val, MOD, M0); return *this; } - - inline void store(limb_t *p) const - { vec_copy(p, val, sizeof(val)); } - - inline blst_384_t& operator+=(const blst_384_t& b) - { add_mod_384(val, val, b, MOD); return *this; } - friend inline blst_384_t operator+(const blst_384_t& a, const blst_384_t& b) - { - blst_384_t ret; - add_mod_384(ret, a, b, MOD); - return ret; - } - - inline blst_384_t& operator<<=(unsigned l) - { lshift_mod_384(val, val, l, MOD); return *this; } - friend inline blst_384_t operator<<(const blst_384_t& a, unsigned l) - { - blst_384_t ret; - lshift_mod_384(ret, a, l, MOD); - return ret; - } - - inline blst_384_t& operator>>=(unsigned r) - { rshift_mod_384(val, val, r, MOD); return *this; } - friend inline blst_384_t operator>>(blst_384_t a, unsigned r) - { - blst_384_t ret; - rshift_mod_384(ret, a, r, MOD); - return ret; - } - - inline blst_384_t& operator-=(const blst_384_t& b) - { sub_mod_384(val, val, b, MOD); return *this; } - friend inline blst_384_t operator-(const blst_384_t& a, const blst_384_t& b) - { - blst_384_t ret; - sub_mod_384(ret, a, b, MOD); - return ret; - } - - inline blst_384_t& cneg(bool flag) - { cneg_mod_384(val, val, flag, 
MOD); return *this; } - friend inline blst_384_t cneg(const blst_384_t& a, bool flag) - { - blst_384_t ret; - cneg_mod_384(ret, a, flag, MOD); - return ret; - } - friend inline blst_384_t operator-(const blst_384_t& a) - { - blst_384_t ret; - cneg_mod_384(ret, a, true, MOD); - return ret; - } - - inline blst_384_t& operator*=(const blst_384_t& a) - { - if (this == &a) sqr_mont_384(val, val, MOD, M0); - else mul_mont_384(val, val, a, MOD, M0); - return *this; - } - friend inline blst_384_t operator*(const blst_384_t& a, const blst_384_t& b) - { - blst_384_t ret; - if (&a == &b) sqr_mont_384(ret, a, MOD, M0); - else mul_mont_384(ret, a, b, MOD, M0); - return ret; - } - - // simplified exponentiation, but mind the ^ operator's precedence! - friend inline blst_384_t operator^(const blst_384_t& a, unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - blst_384_t ret; - sqr_mont_384(ret, a, MOD, M0); - return ret; - } else { - blst_384_t ret; - sqr_mont_384(ret, a, MOD, M0); - for (p -= 2; p--;) - mul_mont_384(ret, ret, a, MOD, M0); - return ret; - } - } - inline blst_384_t& operator^=(unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - sqr_mont_384(val, val, MOD, M0); - return *this; - } - return *this = *this^p; - } - inline blst_384_t operator()(unsigned p) - { return *this^p; } - friend inline blst_384_t sqr(const blst_384_t& a) - { return a^2; } - - inline bool is_zero() const - { return vec_is_zero(val, sizeof(val)); } - - inline void zero() - { vec_zero(val, sizeof(val)); } - - blst_384_t reciprocal() const - { - static const blst_384_t MODx{MOD, true}; - static const blst_384_t RRx4 = *reinterpret_cast(RR)<<2; - union { vec768 x; vec384 r[2]; } temp; - - ct_inverse_mod_383(temp.x, val, MOD, MODx); - redc_mont_384(temp.r[0], temp.x, MOD, M0); - mul_mont_384(temp.r[0], temp.r[0], RRx4, MOD, M0); - - return *reinterpret_cast(temp.r[0]); - } - friend inline blst_384_t operator/(unsigned one, const blst_384_t& a) - { - if (one == 1) - return a.reciprocal(); - abort(); - } - friend inline blst_384_t operator/(const blst_384_t& a, const blst_384_t& b) - { return a * b.reciprocal(); } - inline blst_384_t& operator/=(const blst_384_t& a) - { return *this *= a.reciprocal(); } - -#ifndef NDEBUG - inline blst_384_t(const char *hexascii) - { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } - - friend inline bool operator==(const blst_384_t& a, const blst_384_t& b) - { return vec_is_equal(a, b, sizeof(vec384)); } - friend inline bool operator!=(const blst_384_t& a, const blst_384_t& b) - { return !vec_is_equal(a, b, sizeof(vec384)); } - -# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard - friend std::ostream& operator<<(std::ostream& os, const blst_384_t& obj) - { - unsigned char be[sizeof(obj)]; - char buf[2+2*sizeof(obj)+1], *str = buf; - - be_bytes_from_limbs(be, blst_384_t{obj}.from(), sizeof(obj)); - - *str++ = '0', *str++ = 'x'; - for (size_t i = 0; i < sizeof(obj); i++) - *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); - *str = '\0'; - - return os << buf; - } -# endif -#endif -}; - -template -class blst_256_t { - vec256 val; - - inline operator const limb_t*() const { return val; } - inline operator limb_t*() { return val; } - inline limb_t& operator[](size_t i) { return val[i]; } - inline const limb_t& operator[](size_t i) const { return val[i]; } - -public: - static const size_t n = sizeof(vec256)/sizeof(limb_t); - static const size_t nbits = vec_nbits(MOD, n); - typedef byte pow_t[256/8]; - - inline blst_256_t() {} 
- inline blst_256_t(const vec256 p, bool align = false) - { - if (align) - vec_left_align(val, p, n); - else - vec_copy(val, p, sizeof(val)); - } - inline blst_256_t(uint64_t a) - { - vec_zero(val, sizeof(val)); - val[0] = a; - if (a) to(); - } - inline blst_256_t(int a) : blst_256_t((uint64_t)a) {} - - inline void to_scalar(pow_t& scalar) const - { - const union { - long one; - char little; - } is_endian = { 1 }; - - if ((size_t)scalar%sizeof(limb_t) == 0 && is_endian.little) { - from_mont_256((limb_t *)scalar, val, MOD, M0); - } else { - vec256 out; - from_mont_256(out, val, MOD, M0); - le_bytes_from_limbs(scalar, out, sizeof(pow_t)); - vec_zero(out, sizeof(out)); - } - } - - static inline const blst_256_t& one() - { return *reinterpret_cast(ONE); } - - inline blst_256_t& to() - { mul_mont_sparse_256(val, val, RR, MOD, M0); return *this; } - inline blst_256_t& to(const uint64_t a[2*n]) - { - mul_mont_sparse_256(val, RR, (const limb_t*)(a + n), MOD, M0); - vec256 lo{0}; - add_mod_256(lo, lo, (const limb_t*)a, MOD); - add_mod_256(val, val, lo, MOD); - mul_mont_sparse_256(val, RR, val, MOD, M0); - - return *this; - } - blst_256_t& to(const unsigned char* bytes, size_t n, bool le = false) - { - vec_zero(val, sizeof(val)); - - vec256 digit, zero{0}; - size_t rem = (n - 1) % 32 + 1; - n -= rem; - - if (le) { - limbs_from_le_bytes(val, bytes += n, rem); - mul_mont_sparse_256(val, RR, val, MOD, M0); - while (n) { - limbs_from_le_bytes(digit, bytes -= 32, 32); - add_mod_256(digit, digit, zero, MOD); - add_mod_256(val, val, digit, MOD); - mul_mont_sparse_256(val, RR, val, MOD, M0); - n -= 32; - } - } else { - limbs_from_be_bytes(val, bytes, rem); - mul_mont_sparse_256(val, RR, val, MOD, M0); - bytes += rem; - while (n) { - limbs_from_be_bytes(digit, bytes, 32); - add_mod_256(digit, digit, zero, MOD); - add_mod_256(val, val, digit, MOD); - mul_mont_sparse_256(val, RR, val, MOD, M0); - bytes += 32; - n -= 32; - } - } - - return *this; - } - - inline blst_256_t& from() - { from_mont_256(val, val, MOD, M0); return *this; } - - inline void store(limb_t *p) const - { vec_copy(p, val, sizeof(val)); } - - inline blst_256_t& operator+=(const blst_256_t& b) - { add_mod_256(val, val, b, MOD); return *this; } - friend inline blst_256_t operator+(const blst_256_t& a, const blst_256_t& b) - { - blst_256_t ret; - add_mod_256(ret, a, b, MOD); - return ret; - } - - inline blst_256_t& operator<<=(unsigned l) - { lshift_mod_256(val, val, l, MOD); return *this; } - friend inline blst_256_t operator<<(const blst_256_t& a, unsigned l) - { - blst_256_t ret; - lshift_mod_256(ret, a, l, MOD); - return ret; - } - - inline blst_256_t& operator>>=(unsigned r) - { lshift_mod_256(val, val, r, MOD); return *this; } - friend inline blst_256_t operator>>(blst_256_t a, unsigned r) - { - blst_256_t ret; - lshift_mod_256(ret, a, r, MOD); - return ret; - } - - inline blst_256_t& operator-=(const blst_256_t& b) - { sub_mod_256(val, val, b, MOD); return *this; } - friend inline blst_256_t operator-(const blst_256_t& a, const blst_256_t& b) - { - blst_256_t ret; - sub_mod_256(ret, a, b, MOD); - return ret; - } - - inline blst_256_t& cneg(bool flag) - { cneg_mod_256(val, val, flag, MOD); return *this; } - friend inline blst_256_t cneg(const blst_256_t& a, bool flag) - { - blst_256_t ret; - cneg_mod_256(ret, a, flag, MOD); - return ret; - } - friend inline blst_256_t operator-(const blst_256_t& a) - { - blst_256_t ret; - cneg_mod_256(ret, a, true, MOD); - return ret; - } - - inline blst_256_t& operator*=(const blst_256_t& a) - { - if 
(this == &a) sqr_mont_sparse_256(val, val, MOD, M0); - else mul_mont_sparse_256(val, val, a, MOD, M0); - return *this; - } - friend inline blst_256_t operator*(const blst_256_t& a, const blst_256_t& b) - { - blst_256_t ret; - if (&a == &b) sqr_mont_sparse_256(ret, a, MOD, M0); - else mul_mont_sparse_256(ret, a, b, MOD, M0); - return ret; - } - - // simplified exponentiation, but mind the ^ operator's precedence! - friend inline blst_256_t operator^(const blst_256_t& a, unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - blst_256_t ret; - sqr_mont_sparse_256(ret, a, MOD, M0); - return ret; - } else { - blst_256_t ret; - sqr_mont_sparse_256(ret, a, MOD, M0); - for (p -= 2; p--;) - mul_mont_sparse_256(ret, ret, a, MOD, M0); - return ret; - } - } - inline blst_256_t& operator^=(unsigned p) - { - if (p < 2) { - abort(); - } else if (p == 2) { - sqr_mont_sparse_256(val, val, MOD, M0); - return *this; - } - return *this = *this^p; - } - inline blst_256_t operator()(unsigned p) - { return *this^p; } - friend inline blst_256_t sqr(const blst_256_t& a) - { return a^2; } - - inline bool is_zero() const - { return vec_is_zero(val, sizeof(val)); } - - inline void zero() - { vec_zero(val, sizeof(val)); } - - blst_256_t reciprocal() const - { - static const blst_256_t MODx{MOD, true}; - union { vec512 x; vec256 r[2]; } temp; - - ct_inverse_mod_256(temp.x, val, MOD, MODx); - redc_mont_256(temp.r[0], temp.x, MOD, M0); - mul_mont_sparse_256(temp.r[0], temp.r[0], RR, MOD, M0); - - return *reinterpret_cast(temp.r[0]); - } - friend inline blst_256_t operator/(int one, const blst_256_t& a) - { - if (one == 1) - return a.reciprocal(); - abort(); - } - friend inline blst_256_t operator/(const blst_256_t& a, const blst_256_t& b) - { return a * b.reciprocal(); } - inline blst_256_t& operator/=(const blst_256_t& a) - { return *this *= a.reciprocal(); } - -#ifndef NDEBUG - inline blst_256_t(const char *hexascii) - { limbs_from_hexascii(val, sizeof(val), hexascii); to(); } - - friend inline bool operator==(const blst_256_t& a, const blst_256_t& b) - { return vec_is_equal(a, b, sizeof(vec256)); } - friend inline bool operator!=(const blst_256_t& a, const blst_256_t& b) - { return !vec_is_equal(a, b, sizeof(vec256)); } - -# if defined(_GLIBCXX_IOSTREAM) || defined(_IOSTREAM_) // non-standard - friend std::ostream& operator<<(std::ostream& os, const blst_256_t& obj) - { - unsigned char be[sizeof(obj)]; - char buf[2+2*sizeof(obj)+1], *str=buf; - - be_bytes_from_limbs(be, blst_256_t{obj}.from(), sizeof(obj)); - - *str++ = '0', *str++ = 'x'; - for (size_t i = 0; i < sizeof(obj); i++) - *str++ = hex_from_nibble(be[i]>>4), *str++ = hex_from_nibble(be[i]); - *str = '\0'; - - return os << buf; - } -# endif -#endif -}; -#endif diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh deleted file mode 100755 index e8c8137c287..00000000000 --- a/crypto/blst_src/build/refresh.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/sh - -HERE=`dirname $0` -cd "${HERE}" - -PERL=${PERL:-perl} - -for pl in ../src/asm/*-x86_64.pl; do - s=`basename $pl .pl`.asm - expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) - s=`basename $pl .pl`.s - (set -x; ${PERL} $pl elf > elf/$s) - (set -x; ${PERL} $pl mingw64 > coff/$s) - (set -x; ${PERL} $pl macosx > mach-o/$s) -done - -for pl in ../src/asm/*-armv8.pl; do - s=`basename $pl .pl`.asm - (set -x; ${PERL} $pl win64 > win64/$s) - s=`basename $pl .pl`.S - (set -x; ${PERL} $pl linux64 > elf/$s) - (set -x; ${PERL} $pl coff64 > coff/$s) - (set -x; 
${PERL} $pl ios64 > mach-o/$s) -done - -( cd ../bindings; - echo "LIBRARY blst" - echo - echo "EXPORTS" - cc -E blst.h | \ - ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' - echo -) > win64/blst.def - -if which bindgen > /dev/null 2>&1; then - ( cd ../bindings; set -x; - bindgen --opaque-type blst_pairing \ - --opaque-type blst_uniq \ - --with-derive-default \ - --with-derive-eq \ - --size_t-is-usize \ - --rustified-enum BLST.\* \ - blst.h -- -D__BLST_RUST_BINDGEN__ \ - | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs - ) -else - echo "Install Rust bindgen with 'cargo install bindgen'" 1>&2 - exit 1 -fi From 3c5accd9ac33e2f035e3156729b893ae21c21b07 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 00:57:16 -0600 Subject: [PATCH 006/200] disable c99 and compile blst src and assembly --- crypto/bls.go | 2 +- crypto/bls12381_utils.go | 5 +- crypto/bls12381_utils.h | 2 +- crypto/bls_crossBLST_test.go | 47 +- crypto/bls_multisig.go | 2 +- crypto/bls_thresholdsign.go | 2 +- .../build/assembly.S => blst_assembly.S} | 0 crypto/blst_include.h | 7 + crypto/{blst_src/server.c => blst_src.c} | 0 crypto/blst_src/asm/add_mod_256-armv8.pl | 412 --- crypto/blst_src/asm/add_mod_256-x86_64.pl | 547 ---- crypto/blst_src/asm/add_mod_384-armv8.pl | 937 ------ crypto/blst_src/asm/add_mod_384-x86_64.pl | 1500 --------- crypto/blst_src/asm/add_mod_384x384-x86_64.pl | 260 -- crypto/blst_src/asm/arm-xlate.pl | 386 --- .../blst_src/asm/ct_inverse_mod_256-armv8.pl | 586 ---- .../blst_src/asm/ct_inverse_mod_256-x86_64.pl | 837 ------ .../blst_src/asm/ct_inverse_mod_384-armv8.pl | 610 ---- .../asm/ct_is_square_mod_384-armv8.pl | 401 --- .../asm/ct_is_square_mod_384-x86_64.pl | 494 --- .../asm/ctq_inverse_mod_384-x86_64.pl | 886 ------ .../asm/ctx_inverse_mod_384-x86_64.pl | 995 ------ crypto/blst_src/asm/div3w-armv8.pl | 122 - crypto/blst_src/asm/div3w-x86_64.pl | 184 -- crypto/blst_src/asm/mul_mont_256-armv8.pl | 409 --- crypto/blst_src/asm/mul_mont_384-armv8.pl | 2015 ------------- crypto/blst_src/asm/mulq_mont_256-x86_64.pl | 513 ---- crypto/blst_src/asm/mulq_mont_384-x86_64.pl | 2675 ----------------- crypto/blst_src/asm/mulx_mont_256-x86_64.pl | 486 --- crypto/blst_src/asm/mulx_mont_384-x86_64.pl | 2384 --------------- crypto/blst_src/asm/sha256-armv8.pl | 541 ---- crypto/blst_src/asm/sha256-portable-x86_64.pl | 337 --- crypto/blst_src/asm/sha256-x86_64.pl | 789 ----- crypto/blst_src/asm/x86_64-xlate.pl | 1781 ----------- crypto/blst_src/client_min_pk.c | 4 +- crypto/blst_src/client_min_sig.c | 4 +- crypto/dkg_feldmanvss.go | 2 +- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_jointfeldman.go | 2 +- crypto/spock.go | 2 +- 40 files changed, 48 insertions(+), 21122 deletions(-) rename crypto/{blst_src/build/assembly.S => blst_assembly.S} (100%) create mode 100644 crypto/blst_include.h rename crypto/{blst_src/server.c => blst_src.c} (100%) delete mode 100755 crypto/blst_src/asm/add_mod_256-armv8.pl delete mode 100755 crypto/blst_src/asm/add_mod_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/add_mod_384-armv8.pl delete mode 100755 crypto/blst_src/asm/add_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/add_mod_384x384-x86_64.pl delete mode 100755 crypto/blst_src/asm/arm-xlate.pl delete mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl delete mode 100755 crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl delete mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl 
delete mode 100755 crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/div3w-armv8.pl delete mode 100755 crypto/blst_src/asm/div3w-x86_64.pl delete mode 100755 crypto/blst_src/asm/mul_mont_256-armv8.pl delete mode 100755 crypto/blst_src/asm/mul_mont_384-armv8.pl delete mode 100755 crypto/blst_src/asm/mulq_mont_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/mulq_mont_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/mulx_mont_256-x86_64.pl delete mode 100755 crypto/blst_src/asm/mulx_mont_384-x86_64.pl delete mode 100755 crypto/blst_src/asm/sha256-armv8.pl delete mode 100755 crypto/blst_src/asm/sha256-portable-x86_64.pl delete mode 100755 crypto/blst_src/asm/sha256-x86_64.pl delete mode 100755 crypto/blst_src/asm/x86_64-xlate.pl diff --git a/crypto/bls.go b/crypto/bls.go index 1d725ebab63..6786f00c4d5 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -31,7 +31,7 @@ package crypto // - membership checks G2 using Bowe's method (https://eprint.iacr.org/2019/814.pdf) // - implement a G1/G2 swap (signatures on G2 and public keys on G1) -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 7e327571e47..fa931cffab6 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,18 +7,17 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" -// #include "blst.h" import "C" import ( "errors" ) -// Go wrappers to Relic C types +// Go wrappers around Relic C types // Relic is compiled with ALLOC=AUTO type pointG1 C.ep_st type pointG2 C.ep2_st diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index de2efe9cb53..d6978d6188d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -8,7 +8,7 @@ #define _REL_MISC_INCLUDE_H #include "relic.h" -#include "blst.h" +#include "blst_include.h" #define VALID RLC_OK #define INVALID RLC_ERR diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index f2ef6f16431..9ed78de2873 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -16,19 +16,26 @@ package crypto // both libraries might have made different choices. It is nevertheless a good flag for possible bugs or deviations // from the standard as both libraries are being developed. 
-import ( +/*import ( "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" blst "github.com/supranational/blst/bindings/go" "pgregory.net/rapid" -) + + "github.com/onflow/flow-go/crypto" +)*/ + +// TODO: this file can't compile because of duplicate C and assembly symbols (the ones used +// by the current library and the same ones used by the imported package BLST). Unfortunately, +// cgo doesn't differentiate the two symbols. These tests need to be rewritten using the internal +// BLST C exports, instead of importing the Go BLST package. // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library -func validPrivateKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) +/*func validPrivateKeyBytesFlow(t *rapid.T) []byte { + seed := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) + sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid if err != nil { assert.FailNow(t, "failed key generation") @@ -38,18 +45,18 @@ func validPrivateKeyBytesFlow(t *rapid.T) []byte { // validPublicKeyBytesFlow generates bytes of a valid public key in Flow library func validPublicKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) + seed := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) + sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed) require.NoError(t, err) return sk.PublicKey().Encode() } // validSignatureBytesFlow generates bytes of a valid signature in Flow library func validSignatureBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) + seed := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "seed").([]byte) + sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed) require.NoError(t, err) - hasher := NewExpandMsgXOFKMAC128("random_tag") + hasher := crypto.NewExpandMsgXOFKMAC128("random_tag") message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) signature, err := sk.Sign(message, hasher) require.NoError(t, err) @@ -58,14 +65,14 @@ func validSignatureBytesFlow(t *rapid.T) []byte { // validPrivateKeyBytesBLST generates bytes of a valid private key in BLST library func validPrivateKeyBytesBLST(t *rapid.T) []byte { - randomSlice := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381) ikm := randomSlice.Draw(t, "ikm").([]byte) return blst.KeyGen(ikm).Serialize() } // validPublicKeyBytesBLST generates bytes of a valid public key in BLST library func validPublicKeyBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) 
blstS := blst.KeyGen(ikm) blstG2 := new(blst.P2Affine).From(blstS) return blstG2.Compress() @@ -73,7 +80,7 @@ func validPublicKeyBytesBLST(t *rapid.T) []byte { // validSignatureBytesBLST generates bytes of a valid signature in BLST library func validSignatureBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLenBLSBLS12381, KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), crypto.KeyGenSeedMinLenBLSBLS12381, crypto.KeyGenSeedMaxLenBLSBLS12381).Draw(t, "ikm").([]byte) blstS := blst.KeyGen(ikm[:]) blstG1 := new(blst.P1Affine).From(blstS) return blstG1.Compress() @@ -82,14 +89,14 @@ func validSignatureBytesBLST(t *rapid.T) []byte { // testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. // This test assumes private key serialization is identical to the one in BLST. func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), prKeyLengthBLSBLS12381, prKeyLengthBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PrKeyLenBLSBLS12381, crypto.PrKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) + skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -109,14 +116,14 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { // testEncodeDecodePublicKeyCrossBLST tests encoding and decoding of public keys keys are consistent with BLST. // This test assumes public key serialization is identical to the one in BLST. func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), PubKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PubKeyLenBLSBLS12381, crypto.PubKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) + pkFlow, err := crypto.DecodePublicKey(crypto.BLSBLS12381, pkBytes) var pkBLST blst.P2Affine res := pkBLST.Deserialize(pkBytes) pkValidBLST := pkBLST.KeyValidate() @@ -137,7 +144,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { // testEncodeDecodeSignatureCrossBLST tests encoding and decoding of signatures are consistent with BLST. // This test assumes signature serialization is identical to the one in BLST. 
func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), SignatureLenBLSBLS12381, SignatureLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.SignatureLenBLSBLS12381, crypto.SignatureLenBLSBLS12381) validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid or a random signature @@ -180,7 +187,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) - skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) + skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) require.NoError(t, err) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -208,4 +215,4 @@ func TestAgainstBLST(t *testing.T) { rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) rapid.Check(t, testEncodeDecodeSignatureCrossBLST) rapid.Check(t, testSignHashCrossBLST) -} +}*/ diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 1dfe29abc05..a915bed4a64 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -27,7 +27,7 @@ import ( // - batch verification of multiple signatures of a single message under multiple // public keys: use a binary tree of aggregations to find the invalid signatures. -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 4256af84ab9..4aa73278d3a 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_assembly.S similarity index 100% rename from crypto/blst_src/build/assembly.S rename to crypto/blst_assembly.S diff --git a/crypto/blst_include.h b/crypto/blst_include.h new file mode 100644 index 00000000000..586f6069590 --- /dev/null +++ b/crypto/blst_include.h @@ -0,0 +1,7 @@ +#ifndef __BLST_INCLUDE_H__ +#define __BLST_INCLUDE_H__ + +// blst related definitions +// eventually this file would replace blst.h + +#endif \ No newline at end of file diff --git a/crypto/blst_src/server.c b/crypto/blst_src.c similarity index 100% rename from crypto/blst_src/server.c rename to crypto/blst_src.c diff --git a/crypto/blst_src/asm/add_mod_256-armv8.pl b/crypto/blst_src/asm/add_mod_256-armv8.pl deleted file mode 100755 index 34d9145261b..00000000000 --- a/crypto/blst_src/asm/add_mod_256-armv8.pl +++ /dev/null @@ -1,412 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); - -@mod=map("x$_",(4..7)); -@a=map("x$_",(8..11)); -@b=map("x$_",(12..15)); -@t=map("x$_",(16,17,1..3)); - -$code.=<<___; -.text - -.globl add_mod_256 -.hidden add_mod_256 -.type add_mod_256,%function -.align 5 -add_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - - ldp @a[2],@a[3],[$a_ptr,#16] - adds @a[0],@a[0],@b[0] - ldp @b[2],@b[3],[$b_ptr,#16] - adcs @a[1],@a[1],@b[1] - ldp @mod[0],@mod[1],[$n_ptr] - adcs @a[2],@a[2],@b[2] - ldp @mod[2],@mod[3],[$n_ptr,#16] - adcs @a[3],@a[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - stp @a[0],@a[1],[$r_ptr] - csel @a[3],@a[3],@t[3],lo - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size add_mod_256,.-add_mod_256 - -.globl mul_by_3_mod_256 -.hidden mul_by_3_mod_256 -.type mul_by_3_mod_256,%function -.align 5 -mul_by_3_mod_256: - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - - adds @a[0],@b[0],@b[0] - ldp @mod[0],@mod[1],[$b_ptr] - adcs @a[1],@b[1],@b[1] - ldp @mod[2],@mod[3],[$b_ptr,#16] - adcs @a[2],@b[2],@b[2] - adcs @a[3],@b[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - csel @a[3],@a[3],@t[3],lo - - adds @a[0],@a[0],@b[0] - adcs @a[1],@a[1],@b[1] - adcs @a[2],@a[2],@b[2] - adcs @a[3],@a[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - stp @a[0],@a[1],[$r_ptr] - csel @a[3],@a[3],@t[3],lo - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size mul_by_3_mod_256,.-mul_by_3_mod_256 - -.globl lshift_mod_256 -.hidden lshift_mod_256 -.type lshift_mod_256,%function -.align 5 -lshift_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - -.Loop_lshift_mod_256: - adds @a[0],@a[0],@a[0] - sub $b_ptr,$b_ptr,#1 - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adc @t[4],xzr,xzr - - subs @b[0],@a[0],@mod[0] - sbcs @b[1],@a[1],@mod[1] - sbcs @b[2],@a[2],@mod[2] - sbcs @b[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@b[0],lo - csel @a[1],@a[1],@b[1],lo - csel @a[2],@a[2],@b[2],lo - csel @a[3],@a[3],@b[3],lo - - cbnz $b_ptr,.Loop_lshift_mod_256 - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size lshift_mod_256,.-lshift_mod_256 - -.globl rshift_mod_256 -.hidden rshift_mod_256 -.type rshift_mod_256,%function -.align 5 -rshift_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - -.Loop_rshift: - adds @b[0],@a[0],@mod[0] - sub $b_ptr,$b_ptr,#1 - adcs @b[1],@a[1],@mod[1] - adcs 
@b[2],@a[2],@mod[2] - adcs @b[3],@a[3],@mod[3] - adc @t[4],xzr,xzr - tst @a[0],#1 - - csel @b[0],@b[0],@a[0],ne - csel @b[1],@b[1],@a[1],ne - csel @b[2],@b[2],@a[2],ne - csel @b[3],@b[3],@a[3],ne - csel @t[4],@t[4],xzr,ne - - extr @a[0],@b[1],@b[0],#1 - extr @a[1],@b[2],@b[1],#1 - extr @a[2],@b[3],@b[2],#1 - extr @a[3],@t[4],@b[3],#1 - - cbnz $b_ptr,.Loop_rshift - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size rshift_mod_256,.-rshift_mod_256 - -.globl cneg_mod_256 -.hidden cneg_mod_256 -.type cneg_mod_256,%function -.align 5 -cneg_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @mod[0],@mod[1],[$n_ptr] - - ldp @a[2],@a[3],[$a_ptr,#16] - subs @b[0],@mod[0],@a[0] - ldp @mod[2],@mod[3],[$n_ptr,#16] - orr @mod[0],@a[0],@a[1] - sbcs @b[1],@mod[1],@a[1] - orr @mod[1],@a[2],@a[3] - sbcs @b[2],@mod[2],@a[2] - orr @t[4],@mod[0],@mod[1] - sbc @b[3],@mod[3],@a[3] - - cmp @t[4],#0 - csetm @t[4],ne - ands $b_ptr,$b_ptr,@t[4] - - csel @a[0],@a[0],@b[0],eq - csel @a[1],@a[1],@b[1],eq - csel @a[2],@a[2],@b[2],eq - stp @a[0],@a[1],[$r_ptr] - csel @a[3],@a[3],@b[3],eq - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size cneg_mod_256,.-cneg_mod_256 - -.globl sub_mod_256 -.hidden sub_mod_256 -.type sub_mod_256,%function -.align 5 -sub_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - - ldp @a[2],@a[3],[$a_ptr,#16] - subs @a[0],@a[0],@b[0] - ldp @b[2],@b[3],[$b_ptr,#16] - sbcs @a[1],@a[1],@b[1] - ldp @mod[0],@mod[1],[$n_ptr] - sbcs @a[2],@a[2],@b[2] - ldp @mod[2],@mod[3],[$n_ptr,#16] - sbcs @a[3],@a[3],@b[3] - sbc @t[4],xzr,xzr - - and @mod[0],@mod[0],@t[4] - and @mod[1],@mod[1],@t[4] - adds @a[0],@a[0],@mod[0] - and @mod[2],@mod[2],@t[4] - adcs @a[1],@a[1],@mod[1] - and @mod[3],@mod[3],@t[4] - adcs @a[2],@a[2],@mod[2] - stp @a[0],@a[1],[$r_ptr] - adc @a[3],@a[3],@mod[3] - stp @a[2],@a[3],[$r_ptr,#16] - - ret -.size sub_mod_256,.-sub_mod_256 - -.globl check_mod_256 -.hidden check_mod_256 -.type check_mod_256,%function -.align 5 -check_mod_256: - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @a[1],@a[1] - rev @a[2],@a[2] - rev @a[3],@a[3] -#endif - - subs xzr,@a[0],@mod[0] - sbcs xzr,@a[1],@mod[1] - orr @a[0],@a[0],@a[1] - sbcs xzr,@a[2],@mod[2] - orr @a[0],@a[0],@a[2] - sbcs xzr,@a[3],@mod[3] - orr @a[0],@a[0],@a[3] - sbc $a_ptr,xzr,xzr - - cmp @a[0],#0 - mov x0,#1 - csel x0,x0,xzr,ne - and x0,x0,$a_ptr - - ret -.size check_mod_256,.-check_mod_256 - -.globl add_n_check_mod_256 -.hidden add_n_check_mod_256 -.type add_n_check_mod_256,%function -.align 5 -add_n_check_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @b[0],@b[0] - rev @a[1],@a[1] - rev @b[1],@b[1] - rev @a[2],@a[2] - rev @b[2],@b[2] - rev @a[3],@a[3] - rev @b[3],@b[3] -#endif - - adds @a[0],@a[0],@b[0] - ldp @mod[0],@mod[1],[$n_ptr] - adcs @a[1],@a[1],@b[1] - ldp @mod[2],@mod[3],[$n_ptr,#16] - adcs @a[2],@a[2],@b[2] - adcs @a[3],@a[3],@b[3] - adc @t[4],xzr,xzr - - subs @t[0],@a[0],@mod[0] - sbcs @t[1],@a[1],@mod[1] - sbcs @t[2],@a[2],@mod[2] - sbcs @t[3],@a[3],@mod[3] - sbcs xzr,@t[4],xzr - - csel @a[0],@a[0],@t[0],lo - csel @a[1],@a[1],@t[1],lo - csel @a[2],@a[2],@t[2],lo - csel @a[3],@a[3],@t[3],lo - - orr @t[0], @a[0], @a[1] - orr @t[1], @a[2], @a[3] - orr @t[0], @t[0], @t[1] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @a[1],@a[1] - rev @a[2],@a[2] - rev @a[3],@a[3] 
-#endif - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - mov @t[1], #1 - cmp @t[0], #0 - csel x0, @t[1], xzr, ne - - ret -.size add_n_check_mod_256,.-add_n_check_mod_256 - -.globl sub_n_check_mod_256 -.hidden sub_n_check_mod_256 -.type sub_n_check_mod_256,%function -.align 5 -sub_n_check_mod_256: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @b[0],@b[0] - rev @a[1],@a[1] - rev @b[1],@b[1] - rev @a[2],@a[2] - rev @b[2],@b[2] - rev @a[3],@a[3] - rev @b[3],@b[3] -#endif - - subs @a[0],@a[0],@b[0] - sbcs @a[1],@a[1],@b[1] - ldp @mod[0],@mod[1],[$n_ptr] - sbcs @a[2],@a[2],@b[2] - ldp @mod[2],@mod[3],[$n_ptr,#16] - sbcs @a[3],@a[3],@b[3] - sbc @t[4],xzr,xzr - - and @mod[0],@mod[0],@t[4] - and @mod[1],@mod[1],@t[4] - adds @a[0],@a[0],@mod[0] - and @mod[2],@mod[2],@t[4] - adcs @a[1],@a[1],@mod[1] - and @mod[3],@mod[3],@t[4] - adcs @a[2],@a[2],@mod[2] - adc @a[3],@a[3],@mod[3] - - orr @t[0], @a[0], @a[1] - orr @t[1], @a[2], @a[3] - orr @t[0], @t[0], @t[1] - -#ifdef __AARCH64EB__ - rev @a[0],@a[0] - rev @a[1],@a[1] - rev @a[2],@a[2] - rev @a[3],@a[3] -#endif - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - mov @t[1], #1 - cmp @t[0], #0 - csel x0, @t[1], xzr, ne - - ret -.size sub_n_check_mod_256,.-sub_n_check_mod_256 -___ - -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_256-x86_64.pl b/crypto/blst_src/asm/add_mod_256-x86_64.pl deleted file mode 100755 index 1d656fb90bf..00000000000 --- a/crypto/blst_src/asm/add_mod_256-x86_64.pl +++ /dev/null @@ -1,547 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); -$b_ptr = "%rbx"; - -{ ############################################################## 256 bits add -my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); - -$code.=<<___; -.text - -.globl add_mod_256 -.hidden add_mod_256 -.type add_mod_256,\@function,4,"unwind" -.align 32 -add_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - -.Loaded_a_add_mod_256: - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - mov @acc[0], @acc[4] - adc 8*2($b_org), @acc[2] - mov @acc[1], @acc[5] - adc 8*3($b_org), @acc[3] - sbb $b_org, $b_org - - mov @acc[2], @acc[6] - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], @acc[7] - sbb 8*3($n_ptr), @acc[3] - sbb \$0, $b_org - - cmovc @acc[4], @acc[0] - cmovc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_256,.-add_mod_256 - -######################################################################## -.globl mul_by_3_mod_256 -.hidden mul_by_3_mod_256 -.type mul_by_3_mod_256,\@function,3,"unwind" -.align 32 -mul_by_3_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 -.cfi_end_prologue - - mov $b_org,$n_ptr - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov $a_ptr,$b_org - mov 8*3($a_ptr), @acc[3] - - call __lshift_mod_256 - mov 0(%rsp),%r12 -.cfi_restore %r12 - jmp .Loaded_a_add_mod_256 - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_3_mod_256,.-mul_by_3_mod_256 - -.type __lshift_mod_256,\@abi-omnipotent -.align 32 -__lshift_mod_256: - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - mov @acc[0], @acc[4] - adc @acc[2], @acc[2] - mov @acc[1], @acc[5] - adc @acc[3], @acc[3] - sbb @acc[8], @acc[8] - - mov @acc[2], @acc[6] - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], @acc[7] - sbb 8*3($n_ptr), @acc[3] - sbb \$0, @acc[8] - - cmovc @acc[4], @acc[0] - cmovc @acc[5], @acc[1] - cmovc @acc[6], @acc[2] - cmovc @acc[7], @acc[3] - - ret -.size __lshift_mod_256,.-__lshift_mod_256 - -######################################################################## -.globl lshift_mod_256 -.hidden lshift_mod_256 -.type lshift_mod_256,\@function,4,"unwind" -.align 32 -lshift_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 
-.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - -.Loop_lshift_mod_256: - call __lshift_mod_256 - dec %edx - jnz .Loop_lshift_mod_256 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 0(%rsp),%r12 -.cfi_restore %r12 - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size lshift_mod_256,.-lshift_mod_256 - -######################################################################## -.globl rshift_mod_256 -.hidden rshift_mod_256 -.type rshift_mod_256,\@function,4,"unwind" -.align 32 -rshift_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[7] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - -.Loop_rshift_mod_256: - mov @acc[7], @acc[0] - and \$1, @acc[7] - mov 8*0($n_ptr), @acc[4] - neg @acc[7] - mov 8*1($n_ptr), @acc[5] - mov 8*2($n_ptr), @acc[6] - - and @acc[7], @acc[4] - and @acc[7], @acc[5] - and @acc[7], @acc[6] - and 8*3($n_ptr), @acc[7] - - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - adc @acc[6], @acc[2] - adc @acc[7], @acc[3] - sbb @acc[4], @acc[4] - - shr \$1, @acc[0] - mov @acc[1], @acc[7] - shr \$1, @acc[1] - mov @acc[2], @acc[6] - shr \$1, @acc[2] - mov @acc[3], @acc[5] - shr \$1, @acc[3] - - shl \$63, @acc[7] - shl \$63, @acc[6] - or @acc[0], @acc[7] - shl \$63, @acc[5] - or @acc[6], @acc[1] - shl \$63, @acc[4] - or @acc[5], @acc[2] - or @acc[4], @acc[3] - - dec %edx - jnz .Loop_rshift_mod_256 - - mov @acc[7], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size rshift_mod_256,.-rshift_mod_256 - -######################################################################## -.globl cneg_mod_256 -.hidden cneg_mod_256 -.type cneg_mod_256,\@function,4,"unwind" -.align 32 -cneg_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[8] # load a[0:3] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov @acc[8], @acc[0] - mov 8*3($a_ptr), @acc[3] - or @acc[1], @acc[8] - or @acc[2], @acc[8] - or @acc[3], @acc[8] - mov \$-1, @acc[7] - - mov 8*0($n_ptr), @acc[4] # load n[0:3] - cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 - mov 8*1($n_ptr), @acc[5] - mov 8*2($n_ptr), @acc[6] - and @acc[8], @acc[4] # n[0:3] &= mask - mov 8*3($n_ptr), @acc[7] - and @acc[8], @acc[5] - and @acc[8], @acc[6] - and @acc[8], @acc[7] - - sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 - sbb @acc[1], @acc[5] - sbb @acc[2], @acc[6] - sbb @acc[3], @acc[7] - - or $b_org, $b_org # check condition flag - - cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] - cmovz @acc[1], @acc[5] - mov @acc[4], 8*0($r_ptr) - cmovz @acc[2], @acc[6] - mov @acc[5], 8*1($r_ptr) - cmovz @acc[3], @acc[7] - mov @acc[6], 8*2($r_ptr) - mov @acc[7], 8*3($r_ptr) - - mov 0(%rsp),%r12 -.cfi_restore %r12 - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size cneg_mod_256,.-cneg_mod_256 - -######################################################################## -.globl sub_mod_256 -.hidden sub_mod_256 -.type sub_mod_256,\@function,4,"unwind" -.align 32 -sub_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[4] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[5] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[6] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[7] - sbb $b_org, $b_org - - and $b_org, @acc[4] - and $b_org, @acc[5] - and $b_org, @acc[6] - and $b_org, @acc[7] - - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_256,.-sub_mod_256 - -######################################################################## -.globl check_mod_256 -.hidden check_mod_256 -.type check_mod_256,\@function,2,"unwind" -.align 32 -check_mod_256: -.cfi_startproc - mov 8*0($r_ptr), %rax - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - - mov %rax, @acc[0] # see if it's zero - or @acc[1], %rax - or @acc[2], %rax - or @acc[3], %rax - - sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
- sbb 8*1($a_ptr), @acc[1] - sbb 8*2($a_ptr), @acc[2] - sbb 8*3($a_ptr), @acc[3] - sbb $a_ptr, $a_ptr - - mov \$1, %rdx - cmp \$0, %rax - cmovne %rdx, %rax - and $a_ptr, %rax -.cfi_epilogue - ret -.cfi_endproc -.size check_mod_256,.-check_mod_256 - -######################################################################## -.globl add_n_check_mod_256 -.hidden add_n_check_mod_256 -.type add_n_check_mod_256,\@function,4,"unwind" -.align 32 -add_n_check_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - mov @acc[0], @acc[4] - adc 8*2($b_org), @acc[2] - mov @acc[1], @acc[5] - adc 8*3($b_org), @acc[3] - sbb $b_org, $b_org - - mov @acc[2], @acc[6] - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], @acc[7] - sbb 8*3($n_ptr), @acc[3] - sbb \$0, $b_org - - cmovc @acc[4], @acc[0] - cmovc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - or @acc[1], @acc[0] - or @acc[3], @acc[2] - or @acc[2], @acc[0] - mov \$1, %rax - cmovz @acc[0], %rax - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size add_n_check_mod_256,.-add_n_check_mod_256 - -######################################################################## -.globl sub_n_check_mod_256 -.hidden sub_n_check_mod_256 -.type sub_n_check_mod_256,\@function,4,"unwind" -.align 32 -sub_n_check_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[4] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[5] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[6] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[7] - sbb $b_org, $b_org - - and $b_org, @acc[4] - and $b_org, @acc[5] - and $b_org, @acc[6] - and $b_org, @acc[7] - - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[6], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[7], @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - or @acc[1], @acc[0] - or @acc[3], @acc[2] - or @acc[2], @acc[0] - mov \$1, %rax - cmovz @acc[0], %rax - - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size sub_n_check_mod_256,.-sub_n_check_mod_256 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-armv8.pl b/crypto/blst_src/asm/add_mod_384-armv8.pl deleted file mode 100755 index 6accdbb19a1..00000000000 --- a/crypto/blst_src/asm/add_mod_384-armv8.pl +++ /dev/null @@ -1,937 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); - -@mod=map("x$_",(4..9)); -@a=map("x$_",(10..15)); -@b=map("x$_",(16,17,19..22)); -$carry=$n_ptr; - -$code.=<<___; -.text - -.globl add_mod_384 -.hidden add_mod_384 -.type add_mod_384,%function -.align 5 -add_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size add_mod_384,.-add_mod_384 - -.type __add_mod_384,%function -.align 5 -__add_mod_384: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @b[4],@b[5],[$b_ptr,#32] - -__add_mod_384_ab_are_loaded: - adds @a[0],@a[0],@b[0] - adcs @a[1],@a[1],@b[1] - adcs @a[2],@a[2],@b[2] - adcs @a[3],@a[3],@b[3] - adcs @a[4],@a[4],@b[4] - adcs @a[5],@a[5],@b[5] - adc $carry,xzr,xzr - - subs @b[0],@a[0],@mod[0] - sbcs @b[1],@a[1],@mod[1] - sbcs @b[2],@a[2],@mod[2] - sbcs @b[3],@a[3],@mod[3] - sbcs @b[4],@a[4],@mod[4] - sbcs @b[5],@a[5],@mod[5] - sbcs xzr,$carry,xzr - - csel @a[0],@a[0],@b[0],lo - csel @a[1],@a[1],@b[1],lo - csel @a[2],@a[2],@b[2],lo - csel @a[3],@a[3],@b[3],lo - csel @a[4],@a[4],@b[4],lo - csel @a[5],@a[5],@b[5],lo - - ret -.size __add_mod_384,.-__add_mod_384 - -.globl add_mod_384x -.hidden add_mod_384x -.type add_mod_384x,%function -.align 5 -add_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __add_mod_384 - - stp @a[0],@a[1],[$r_ptr] - add $a_ptr,$a_ptr,#48 - stp @a[2],@a[3],[$r_ptr,#16] - add $b_ptr,$b_ptr,#48 - stp @a[4],@a[5],[$r_ptr,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size add_mod_384x,.-add_mod_384x - -.globl rshift_mod_384 -.hidden rshift_mod_384 -.type rshift_mod_384,%function -.align 5 -rshift_mod_384: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - -.Loop_rshift_mod_384: - sub $b_ptr,$b_ptr,#1 - bl __rshift_mod_384 - cbnz $b_ptr,.Loop_rshift_mod_384 - - ldr x30,[sp,#8] - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size rshift_mod_384,.-rshift_mod_384 - -.type __rshift_mod_384,%function -.align 5 -__rshift_mod_384: - sbfx @b[5],@a[0],#0,#1 - and @b[0],@b[5],@mod[0] - and @b[1],@b[5],@mod[1] - adds @a[0],@a[0],@b[0] - and @b[2],@b[5],@mod[2] - adcs @a[1],@a[1],@b[1] - and @b[3],@b[5],@mod[3] - adcs @a[2],@a[2],@b[2] - and @b[4],@b[5],@mod[4] - adcs @a[3],@a[3],@b[3] - and @b[5],@b[5],@mod[5] - adcs @a[4],@a[4],@b[4] - extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 - adcs @a[5],@a[5],@b[5] - extr @a[1],@a[2],@a[1],#1 - adc @b[5],xzr,xzr - extr @a[2],@a[3],@a[2],#1 - extr @a[3],@a[4],@a[3],#1 - extr @a[4],@a[5],@a[4],#1 - extr @a[5],@b[5],@a[5],#1 - ret -.size __rshift_mod_384,.-__rshift_mod_384 - -.globl div_by_2_mod_384 -.hidden div_by_2_mod_384 -.type div_by_2_mod_384,%function -.align 5 -div_by_2_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __rshift_mod_384 - - ldr x30,[sp,#8] - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size div_by_2_mod_384,.-div_by_2_mod_384 - -.globl lshift_mod_384 -.hidden lshift_mod_384 -.type lshift_mod_384,%function -.align 5 -lshift_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - -.Loop_lshift_mod_384: - sub $b_ptr,$b_ptr,#1 - bl __lshift_mod_384 - cbnz $b_ptr,.Loop_lshift_mod_384 - - ldr x30,[sp,#8] - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size lshift_mod_384,.-lshift_mod_384 - -.type __lshift_mod_384,%function -.align 5 -__lshift_mod_384: - adds @a[0],@a[0],@a[0] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $carry,xzr,xzr - - subs @b[0],@a[0],@mod[0] - sbcs @b[1],@a[1],@mod[1] - sbcs @b[2],@a[2],@mod[2] - sbcs @b[3],@a[3],@mod[3] - sbcs @b[4],@a[4],@mod[4] - sbcs @b[5],@a[5],@mod[5] - sbcs xzr,$carry,xzr - - csel @a[0],@a[0],@b[0],lo - csel @a[1],@a[1],@b[1],lo - csel @a[2],@a[2],@b[2],lo - csel @a[3],@a[3],@b[3],lo - csel @a[4],@a[4],@b[4],lo - csel @a[5],@a[5],@b[5],lo - - ret -.size __lshift_mod_384,.-__lshift_mod_384 - -.globl mul_by_3_mod_384 -.hidden mul_by_3_mod_384 -.type mul_by_3_mod_384,%function -.align 5 -mul_by_3_mod_384: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - ldp @b[4],@b[5],[$a_ptr,#32] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_3_mod_384,.-mul_by_3_mod_384 - -.globl mul_by_8_mod_384 -.hidden mul_by_8_mod_384 -.type mul_by_8_mod_384,%function -.align 5 -mul_by_8_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_8_mod_384,.-mul_by_8_mod_384 - -.globl mul_by_3_mod_384x -.hidden mul_by_3_mod_384x -.type mul_by_3_mod_384x,%function -.align 5 -mul_by_3_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - ldp @b[4],@b[5],[$a_ptr,#32] - - bl __add_mod_384_ab_are_loaded - - stp @a[0],@a[1],[$r_ptr] - ldp @a[0],@a[1],[$a_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#16] - ldp @a[2],@a[3],[$a_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#32] - ldp @a[4],@a[5],[$a_ptr,#80] - - bl __lshift_mod_384 - - ldp @b[0],@b[1],[$a_ptr,#48] - ldp @b[2],@b[3],[$a_ptr,#64] - ldp @b[4],@b[5],[$a_ptr,#80] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_3_mod_384x,.-mul_by_3_mod_384x - -.globl mul_by_8_mod_384x -.hidden mul_by_8_mod_384x -.type mul_by_8_mod_384x,%function -.align 5 -mul_by_8_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - - stp @a[0],@a[1],[$r_ptr] - ldp @a[0],@a[1],[$a_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#16] - ldp @a[2],@a[3],[$a_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#32] - ldp @a[4],@a[5],[$a_ptr,#80] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_8_mod_384x,.-mul_by_8_mod_384x - -.globl cneg_mod_384 -.hidden cneg_mod_384 -.type cneg_mod_384,%function -.align 5 -cneg_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @mod[0],@mod[1],[$n_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @mod[2],@mod[3],[$n_ptr,#16] - - subs @b[0],@mod[0],@a[0] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @mod[4],@mod[5],[$n_ptr,#32] - orr $carry,@a[0],@a[1] - sbcs @b[1],@mod[1],@a[1] - orr $carry,$carry,@a[2] - sbcs @b[2],@mod[2],@a[2] - orr $carry,$carry,@a[3] - sbcs @b[3],@mod[3],@a[3] - orr $carry,$carry,@a[4] - sbcs @b[4],@mod[4],@a[4] - orr $carry,$carry,@a[5] - sbc @b[5],@mod[5],@a[5] - - cmp $carry,#0 - csetm $carry,ne - ands $b_ptr,$b_ptr,$carry - - csel @a[0],@a[0],@b[0],eq - csel @a[1],@a[1],@b[1],eq - csel @a[2],@a[2],@b[2],eq - csel @a[3],@a[3],@b[3],eq - stp @a[0],@a[1],[$r_ptr] - csel @a[4],@a[4],@b[4],eq - stp @a[2],@a[3],[$r_ptr,#16] - csel @a[5],@a[5],@b[5],eq - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size cneg_mod_384,.-cneg_mod_384 - -.globl sub_mod_384 -.hidden sub_mod_384 -.type sub_mod_384,%function -.align 5 -sub_mod_384: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size sub_mod_384,.-sub_mod_384 - -.type __sub_mod_384,%function -.align 5 -__sub_mod_384: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @b[2],@b[3],[$b_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @b[4],@b[5],[$b_ptr,#32] - - subs @a[0],@a[0],@b[0] - sbcs @a[1],@a[1],@b[1] - sbcs @a[2],@a[2],@b[2] - sbcs @a[3],@a[3],@b[3] - sbcs @a[4],@a[4],@b[4] - sbcs @a[5],@a[5],@b[5] - sbc $carry,xzr,xzr - - and @b[0],@mod[0],$carry - and @b[1],@mod[1],$carry - adds @a[0],@a[0],@b[0] - and @b[2],@mod[2],$carry - adcs @a[1],@a[1],@b[1] - and @b[3],@mod[3],$carry - adcs @a[2],@a[2],@b[2] - and @b[4],@mod[4],$carry - adcs @a[3],@a[3],@b[3] - and @b[5],@mod[5],$carry - adcs @a[4],@a[4],@b[4] - adc @a[5],@a[5],@b[5] - - ret -.size __sub_mod_384,.-__sub_mod_384 - -.globl sub_mod_384x -.hidden sub_mod_384x -.type sub_mod_384x,%function -.align 5 -sub_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __sub_mod_384 - - stp @a[0],@a[1],[$r_ptr] - add $a_ptr,$a_ptr,#48 - stp @a[2],@a[3],[$r_ptr,#16] - add $b_ptr,$b_ptr,#48 - stp @a[4],@a[5],[$r_ptr,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size sub_mod_384x,.-sub_mod_384x - -.globl mul_by_1_plus_i_mod_384x -.hidden mul_by_1_plus_i_mod_384x -.type mul_by_1_plus_i_mod_384x,%function -.align 5 -mul_by_1_plus_i_mod_384x: - paciasp - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - add $b_ptr,$a_ptr,#48 - - bl __sub_mod_384 // a->re - a->im - - ldp @b[0],@b[1],[$a_ptr] - ldp @b[2],@b[3],[$a_ptr,#16] - ldp @b[4],@b[5],[$a_ptr,#32] - stp @a[0],@a[1],[$r_ptr] - ldp @a[0],@a[1],[$a_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#16] - ldp @a[2],@a[3],[$a_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#32] - ldp @a[4],@a[5],[$a_ptr,#80] - - bl __add_mod_384_ab_are_loaded // a->re + a->im - ldr x30,[sp,#8] - - stp @a[0],@a[1],[$r_ptr,#48] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x - -.globl sgn0_pty_mod_384 -.hidden sgn0_pty_mod_384 -.type sgn0_pty_mod_384,%function -.align 5 -sgn0_pty_mod_384: - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - ldp @a[4],@a[5],[$r_ptr,#32] - - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - - and $r_ptr,@a[0],#1 - adds @a[0],@a[0],@a[0] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $carry,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $carry,$carry,xzr - - mvn $carry,$carry - and $carry,$carry,#2 - orr $r_ptr,$r_ptr,$carry - - ret -.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 - -.globl sgn0_pty_mod_384x -.hidden sgn0_pty_mod_384x -.type sgn0_pty_mod_384x,%function -.align 5 -sgn0_pty_mod_384x: - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - ldp @a[4],@a[5],[$r_ptr,#32] - - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - - and $b_ptr,@a[0],#1 - orr $n_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $n_ptr,$n_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $n_ptr,$n_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr $n_ptr,$n_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $n_ptr,$n_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc @b[0],xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc @b[0],@b[0],xzr - - ldp @a[0],@a[1],[$r_ptr,#48] - ldp @a[2],@a[3],[$r_ptr,#64] - ldp @a[4],@a[5],[$r_ptr,#80] - - mvn @b[0],@b[0] - and @b[0],@b[0],#2 - orr $b_ptr,$b_ptr,@b[0] - - and $r_ptr,@a[0],#1 - orr $a_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $a_ptr,$a_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $a_ptr,$a_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr 
$a_ptr,$a_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $a_ptr,$a_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc @b[0],xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc @b[0],@b[0],xzr - - mvn @b[0],@b[0] - and @b[0],@b[0],#2 - orr $r_ptr,$r_ptr,@b[0] - - cmp $n_ptr,#0 - csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp $a_ptr,#0 - csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) - - and $n_ptr,$n_ptr,#1 - and $a_ptr,$a_ptr,#2 - orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity - - ret -.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x -___ -if (1) { -sub vec_select { -my $sz = shift; -my @v=map("v$_",(0..5,16..21)); - -$code.=<<___; -.globl vec_select_$sz -.hidden vec_select_$sz -.type vec_select_$sz,%function -.align 5 -vec_select_$sz: - dup v6.2d, $n_ptr - ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 -___ -for($i=0; $i<$sz-48; $i+=48) { -$code.=<<___; - bit @v[0].16b, @v[3].16b, v6.16b - ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 - bit @v[1].16b, @v[4].16b, v6.16b - ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 - bit @v[2].16b, @v[5].16b, v6.16b - st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 -___ - @v = @v[6..11,0..5]; -} -$code.=<<___; - bit @v[0].16b, @v[3].16b, v6.16b - bit @v[1].16b, @v[4].16b, v6.16b - bit @v[2].16b, @v[5].16b, v6.16b - st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] - ret -.size vec_select_$sz,.-vec_select_$sz -___ -} -vec_select(32); -vec_select(48); -vec_select(96); -vec_select(192); -vec_select(144); -vec_select(288); -} - -{ -my ($inp, $end, $step) = map("x$_", (0..2)); - -$code.=<<___; -.globl vec_prefetch -.hidden vec_prefetch -.type vec_prefetch,%function -.align 5 -vec_prefetch: - add $end, $end, $inp - sub $end, $end, #1 - mov $step, #64 - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - csel $step, xzr, $step, hi - prfm pldl1keep, [$inp] - add $inp, $inp, $step - cmp $inp, $end - csel $inp, $end, $inp, hi - prfm pldl1keep, [$inp] - ret -.size vec_prefetch,.-vec_prefetch -___ -my $len = $end; - -$code.=<<___; -.globl vec_is_zero_16x -.hidden vec_is_zero_16x -.type vec_is_zero_16x,%function -.align 5 -vec_is_zero_16x: - ld1 {v0.2d}, [$inp], #16 - lsr $len, $len, #4 - sub $len, $len, #1 - cbz $len, .Loop_is_zero_done - -.Loop_is_zero: - ld1 {v1.2d}, [$inp], #16 - orr v0.16b, v0.16b, v1.16b - sub $len, $len, #1 - cbnz $len, .Loop_is_zero - -.Loop_is_zero_done: - dup v1.2d, v0.2d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.2d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret -.size vec_is_zero_16x,.-vec_is_zero_16x -___ -} -{ -my ($inp1, $inp2, $len) = map("x$_", (0..2)); - -$code.=<<___; -.globl vec_is_equal_16x -.hidden vec_is_equal_16x -.type vec_is_equal_16x,%function -.align 5 -vec_is_equal_16x: - ld1 {v0.2d}, [$inp1], #16 - ld1 {v1.2d}, [$inp2], #16 - 
lsr $len, $len, #4 - eor v0.16b, v0.16b, v1.16b - -.Loop_is_equal: - sub $len, $len, #1 - cbz $len, .Loop_is_equal_done - ld1 {v1.2d}, [$inp1], #16 - ld1 {v2.2d}, [$inp2], #16 - eor v1.16b, v1.16b, v2.16b - orr v0.16b, v0.16b, v1.16b - b .Loop_is_equal - nop - -.Loop_is_equal_done: - dup v1.2d, v0.2d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.2d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret -.size vec_is_equal_16x,.-vec_is_equal_16x -___ -} - -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384-x86_64.pl b/crypto/blst_src/asm/add_mod_384-x86_64.pl deleted file mode 100755 index a196191c108..00000000000 --- a/crypto/blst_src/asm/add_mod_384-x86_64.pl +++ /dev/null @@ -1,1500 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -{ ############################################################## 384 bits add -my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); - push(@acc, $a_ptr); - -$code.=<<___; -.text - -.globl add_mod_384 -.hidden add_mod_384 -.type add_mod_384,\@function,4,"unwind" -.align 32 -add_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __add_mod_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_384,.-add_mod_384 - -.type __add_mod_384,\@abi-omnipotent -.align 32 -__add_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -__add_mod_384_a_is_loaded: - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - adc 8*2($b_org), @acc[2] - mov @acc[0], @acc[6] - adc 8*3($b_org), @acc[3] - mov @acc[1], @acc[7] - adc 8*4($b_org), @acc[4] - mov @acc[2], @acc[8] - adc 8*5($b_org), @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[9], @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __add_mod_384,.-__add_mod_384 - -.globl add_mod_384x -.hidden 
add_mod_384x -.type add_mod_384x,\@function,4,"unwind" -.align 32 -add_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$24, %rsp -.cfi_adjust_cfa_offset 24 -.cfi_end_prologue - - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - lea 48($a_ptr), $a_ptr # a->im - lea 48($b_org), $b_org # b->im - lea 48($r_ptr), $r_ptr # ret->im - call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); - - mov 8*0(%rsp), $a_ptr # a->re - mov 8*1(%rsp), $b_org # b->re - lea -48($r_ptr), $r_ptr # ret->re - call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); - - mov 24+8*0(%rsp),%r15 -.cfi_restore %r15 - mov 24+8*1(%rsp),%r14 -.cfi_restore %r14 - mov 24+8*2(%rsp),%r13 -.cfi_restore %r13 - mov 24+8*3(%rsp),%r12 -.cfi_restore %r12 - mov 24+8*4(%rsp),%rbx -.cfi_restore %rbx - mov 24+8*5(%rsp),%rbp -.cfi_restore %rbp - lea 24+8*6(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_384x,.-add_mod_384x - -######################################################################## -.globl rshift_mod_384 -.hidden rshift_mod_384 -.type rshift_mod_384,\@function,4,"unwind" -.align 32 -rshift_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -.Loop_rshift_mod_384: - call __rshift_mod_384 - dec %edx - jnz .Loop_rshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size rshift_mod_384,.-rshift_mod_384 - -.type __rshift_mod_384,\@abi-omnipotent -.align 32 -__rshift_mod_384: - mov \$1, @acc[11] - mov 8*0($n_ptr), @acc[6] - and @acc[0], @acc[11] - mov 8*1($n_ptr), @acc[7] - neg @acc[11] - mov 8*2($n_ptr), @acc[8] - and @acc[11], @acc[6] - mov 8*3($n_ptr), @acc[9] - and @acc[11], @acc[7] - mov 8*4($n_ptr), @acc[10] - and @acc[11], @acc[8] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 8*5($n_ptr), @acc[11] - - add @acc[0], @acc[6] - adc @acc[1], @acc[7] - adc @acc[2], @acc[8] - adc @acc[3], @acc[9] - adc @acc[4], @acc[10] - adc @acc[5], @acc[11] - sbb @acc[5], @acc[5] - - shr \$1, @acc[6] - mov @acc[7], @acc[0] - shr \$1, @acc[7] - mov @acc[8], @acc[1] - shr \$1, @acc[8] - mov @acc[9], @acc[2] - shr \$1, @acc[9] - mov @acc[10], @acc[3] - shr \$1, @acc[10] - mov @acc[11], @acc[4] - shr \$1, @acc[11] - shl \$63, @acc[0] - shl \$63, @acc[1] - or @acc[6], @acc[0] - shl \$63, @acc[2] - or @acc[7], @acc[1] - shl \$63, @acc[3] - or @acc[8], @acc[2] - shl \$63, @acc[4] - or @acc[9], @acc[3] - shl \$63, @acc[5] - or @acc[10], @acc[4] - or @acc[11], @acc[5] - - ret -.size __rshift_mod_384,.-__rshift_mod_384 - -.globl div_by_2_mod_384 -.hidden div_by_2_mod_384 -.type div_by_2_mod_384,\@function,3,"unwind" -.align 32 
-div_by_2_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov $b_org, $n_ptr - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - call __rshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size div_by_2_mod_384,.-div_by_2_mod_384 - -######################################################################## -.globl lshift_mod_384 -.hidden lshift_mod_384 -.type lshift_mod_384,\@function,4,"unwind" -.align 32 -lshift_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -.Loop_lshift_mod_384: - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $r_ptr, $r_ptr - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov (%rsp), $r_ptr - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - cmovc @acc[9], @acc[3] - cmovc @acc[10], @acc[4] - cmovc @acc[11], @acc[5] - - dec %edx - jnz .Loop_lshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size lshift_mod_384,.-lshift_mod_384 - -.type __lshift_mod_384,\@abi-omnipotent -.align 32 -__lshift_mod_384: - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - cmovc @acc[9], @acc[3] - cmovc @acc[10], @acc[4] - cmovc @acc[11], @acc[5] - - 
ret -.size __lshift_mod_384,.-__lshift_mod_384 - -######################################################################## -.globl mul_by_3_mod_384 -.hidden mul_by_3_mod_384 -.type mul_by_3_mod_384,\@function,3,"unwind" -.align 32 -mul_by_3_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - - mov (%rsp), $b_org - call __add_mod_384_a_is_loaded - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_3_mod_384,.-mul_by_3_mod_384 - -.globl mul_by_8_mod_384 -.hidden mul_by_8_mod_384 -.type mul_by_8_mod_384,\@function,3,"unwind" -.align 32 -mul_by_8_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_8_mod_384,.-mul_by_8_mod_384 - -######################################################################## -.globl mul_by_3_mod_384x -.hidden mul_by_3_mod_384x -.type mul_by_3_mod_384x,\@function,3,"unwind" -.align 32 -mul_by_3_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - - mov (%rsp), $b_org - call __add_mod_384_a_is_loaded - - mov (%rsp), $a_ptr - lea 8*6($r_ptr), $r_ptr - - mov 8*6($a_ptr), @acc[0] - mov 8*7($a_ptr), @acc[1] - mov 8*8($a_ptr), @acc[2] - mov 8*9($a_ptr), @acc[3] - mov 8*10($a_ptr), @acc[4] - mov 8*11($a_ptr), @acc[5] - - call __lshift_mod_384 - - mov \$8*6, $b_org - add (%rsp), $b_org - call __add_mod_384_a_is_loaded - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 
56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_3_mod_384x,.-mul_by_3_mod_384x - -.globl mul_by_8_mod_384x -.hidden mul_by_8_mod_384x -.type mul_by_8_mod_384x,\@function,3,"unwind" -.align 32 -mul_by_8_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov $b_org, $n_ptr - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov (%rsp), $a_ptr - mov @acc[0], 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mov 48+8*0($a_ptr), @acc[0] - mov 48+8*1($a_ptr), @acc[1] - mov 48+8*2($a_ptr), @acc[2] - mov 48+8*3($a_ptr), @acc[3] - mov 48+8*4($a_ptr), @acc[4] - mov 48+8*5($a_ptr), @acc[5] - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov @acc[0], 48+8*0($r_ptr) - mov @acc[1], 48+8*1($r_ptr) - mov @acc[2], 48+8*2($r_ptr) - mov @acc[3], 48+8*3($r_ptr) - mov @acc[4], 48+8*4($r_ptr) - mov @acc[5], 48+8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_by_8_mod_384x,.-mul_by_8_mod_384x - -######################################################################## -.globl cneg_mod_384 -.hidden cneg_mod_384 -.type cneg_mod_384,\@function,4,"unwind" -.align 32 -cneg_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $b_org # condition flag -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), $b_org # load a[0:5] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov $b_org, @acc[0] - mov 8*3($a_ptr), @acc[3] - or @acc[1], $b_org - mov 8*4($a_ptr), @acc[4] - or @acc[2], $b_org - mov 8*5($a_ptr), @acc[5] - or @acc[3], $b_org - mov \$-1, @acc[11] - or @acc[4], $b_org - or @acc[5], $b_org - - mov 8*0($n_ptr), @acc[6] # load n[0:5] - cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0 - mov 8*1($n_ptr), @acc[7] - mov 8*2($n_ptr), @acc[8] - and $b_org, @acc[6] # n[0:5] &= mask - mov 8*3($n_ptr), @acc[9] - and $b_org, @acc[7] - mov 8*4($n_ptr), @acc[10] - and $b_org, @acc[8] - mov 8*5($n_ptr), @acc[11] - and $b_org, @acc[9] - mov 0(%rsp), $n_ptr # restore condition flag - and $b_org, @acc[10] - and $b_org, @acc[11] - - sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 - sbb @acc[1], @acc[7] - sbb @acc[2], @acc[8] - sbb @acc[3], @acc[9] - sbb @acc[4], @acc[10] - sbb @acc[5], @acc[11] - - or $n_ptr, $n_ptr # check condition flag - - cmovz @acc[0], @acc[6] # flag ? 
n[0:5]-a[0:5] : a[0:5] - cmovz @acc[1], @acc[7] - cmovz @acc[2], @acc[8] - mov @acc[6], 8*0($r_ptr) - cmovz @acc[3], @acc[9] - mov @acc[7], 8*1($r_ptr) - cmovz @acc[4], @acc[10] - mov @acc[8], 8*2($r_ptr) - cmovz @acc[5], @acc[11] - mov @acc[9], 8*3($r_ptr) - mov @acc[10], 8*4($r_ptr) - mov @acc[11], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size cneg_mod_384,.-cneg_mod_384 - -######################################################################## -.globl sub_mod_384 -.hidden sub_mod_384 -.type sub_mod_384,\@function,4,"unwind" -.align 32 -sub_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sub_mod_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_384,.-sub_mod_384 - -.type __sub_mod_384,\@abi-omnipotent -.align 32 -__sub_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb 8*4($b_org), @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb 8*5($b_org), @acc[5] - mov 8*5($n_ptr), @acc[11] - sbb $b_org, $b_org - - and $b_org, @acc[6] - and $b_org, @acc[7] - and $b_org, @acc[8] - and $b_org, @acc[9] - and $b_org, @acc[10] - and $b_org, @acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[8], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[9], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[10], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[11], @acc[5] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __sub_mod_384,.-__sub_mod_384 - -.globl sub_mod_384x -.hidden sub_mod_384x -.type sub_mod_384x,\@function,4,"unwind" -.align 32 -sub_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$24, %rsp -.cfi_adjust_cfa_offset 24 -.cfi_end_prologue - - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - lea 48($a_ptr), $a_ptr # a->im - lea 48($b_org), $b_org # b->im - lea 48($r_ptr), $r_ptr # ret->im - call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); - - mov 8*0(%rsp), $a_ptr # a->re - mov 8*1(%rsp), $b_org # b->re - lea -48($r_ptr), $r_ptr # ret->re - call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); - - mov 24+8*0(%rsp),%r15 -.cfi_restore %r15 - mov 24+8*1(%rsp),%r14 -.cfi_restore %r14 - mov 24+8*2(%rsp),%r13 -.cfi_restore %r13 - mov 24+8*3(%rsp),%r12 -.cfi_restore %r12 - mov 24+8*4(%rsp),%rbx -.cfi_restore %rbx - mov 24+8*5(%rsp),%rbp -.cfi_restore %rbp - lea 
24+8*6(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_384x,.-sub_mod_384x -___ -} -{ ###################################################### ret = a * (1 + i) -my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); -my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); - -$code.=<<___; -.globl mul_by_1_plus_i_mod_384x -.hidden mul_by_1_plus_i_mod_384x -.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" -.align 32 -mul_by_1_plus_i_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$56, %rsp -.cfi_adjust_cfa_offset 56 -.cfi_end_prologue - - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov @acc[0], @acc[6] - add 8*6($a_ptr), @acc[0] # a->re + a->im - mov @acc[1], @acc[7] - adc 8*7($a_ptr), @acc[1] - mov @acc[2], @acc[8] - adc 8*8($a_ptr), @acc[2] - mov @acc[3], @acc[9] - adc 8*9($a_ptr), @acc[3] - mov @acc[4], @acc[10] - adc 8*10($a_ptr), @acc[4] - mov @acc[5], @acc[11] - adc 8*11($a_ptr), @acc[5] - mov $r_ptr, 8*6(%rsp) # offload r_ptr - sbb $r_ptr, $r_ptr - - sub 8*6($a_ptr), @acc[6] # a->re - a->im - sbb 8*7($a_ptr), @acc[7] - sbb 8*8($a_ptr), @acc[8] - sbb 8*9($a_ptr), @acc[9] - sbb 8*10($a_ptr), @acc[10] - sbb 8*11($a_ptr), @acc[11] - sbb $a_ptr, $a_ptr - - mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] - mov 8*0($n_ptr), @acc[0] - mov @acc[1], 8*1(%rsp) - mov 8*1($n_ptr), @acc[1] - mov @acc[2], 8*2(%rsp) - mov 8*2($n_ptr), @acc[2] - mov @acc[3], 8*3(%rsp) - mov 8*3($n_ptr), @acc[3] - mov @acc[4], 8*4(%rsp) - and $a_ptr, @acc[0] - mov 8*4($n_ptr), @acc[4] - mov @acc[5], 8*5(%rsp) - and $a_ptr, @acc[1] - mov 8*5($n_ptr), @acc[5] - and $a_ptr, @acc[2] - and $a_ptr, @acc[3] - and $a_ptr, @acc[4] - and $a_ptr, @acc[5] - mov 8*6(%rsp), $a_ptr # restore r_ptr - - add @acc[0], @acc[6] - mov 8*0(%rsp), @acc[0] # restore a->re + a->im - adc @acc[1], @acc[7] - mov 8*1(%rsp), @acc[1] - adc @acc[2], @acc[8] - mov 8*2(%rsp), @acc[2] - adc @acc[3], @acc[9] - mov 8*3(%rsp), @acc[3] - adc @acc[4], @acc[10] - mov 8*4(%rsp), @acc[4] - adc @acc[5], @acc[11] - mov 8*5(%rsp), @acc[5] - - mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im - mov @acc[0], @acc[6] - mov @acc[7], 8*1($a_ptr) - mov @acc[8], 8*2($a_ptr) - mov @acc[1], @acc[7] - mov @acc[9], 8*3($a_ptr) - mov @acc[10], 8*4($a_ptr) - mov @acc[2], @acc[8] - mov @acc[11], 8*5($a_ptr) - - sub 8*0($n_ptr), @acc[0] - mov @acc[3], @acc[9] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - mov @acc[4], @acc[10] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im - cmovc @acc[9], @acc[3] - mov @acc[1], 8*7($a_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*8($a_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*9($a_ptr) - mov @acc[4], 8*10($a_ptr) - mov @acc[5], 8*11($a_ptr) - - mov 56+8*0(%rsp),%r15 -.cfi_restore %r15 - mov 56+8*1(%rsp),%r14 -.cfi_restore %r14 - mov 56+8*2(%rsp),%r13 -.cfi_restore %r13 - mov 56+8*3(%rsp),%r12 -.cfi_restore %r12 - mov 56+8*4(%rsp),%rbx -.cfi_restore %rbx - mov 56+8*5(%rsp),%rbp -.cfi_restore %rbp - lea 56+8*6(%rsp),%rsp -.cfi_adjust_cfa_offset -56-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size 
mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x -___ -} -{ ###################################################### -my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); -my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); - -$code.=<<___; -.globl sgn0_pty_mod_384 -.hidden sgn0_pty_mod_384 -.type sgn0_pty_mod_384,\@function,2,"unwind" -.align 32 -sgn0_pty_mod_384: -.cfi_startproc -.cfi_end_prologue - mov 8*0($r_ptr), @acc[0] - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - mov 8*4($r_ptr), @acc[4] - mov 8*5($r_ptr), @acc[5] - - xor %rax, %rax - mov @acc[0], $r_ptr - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - not %rax # 2*x > p, which means "negative" - and \$1, $r_ptr - and \$2, %rax - or $r_ptr, %rax # pack sign and parity - -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 - -.globl sgn0_pty_mod_384x -.hidden sgn0_pty_mod_384x -.type sgn0_pty_mod_384x,\@function,2,"unwind" -.align 32 -sgn0_pty_mod_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*6($r_ptr), @acc[0] # sgn0(a->im) - mov 8*7($r_ptr), @acc[1] - mov 8*8($r_ptr), @acc[2] - mov 8*9($r_ptr), @acc[3] - mov 8*10($r_ptr), @acc[4] - mov 8*11($r_ptr), @acc[5] - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - lea 0($r_ptr), %rax # sgn0(a->re) - xor $r_ptr, $r_ptr - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, $r_ptr - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov @acc[0], 0(%rsp) # a->im is zero or not - not $r_ptr # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, $r_ptr - or @acc[7], $r_ptr # pack sign and parity - - mov 8*0(%rax), @acc[0] - mov 8*1(%rax), @acc[1] - mov 8*2(%rax), @acc[2] - mov 8*3(%rax), @acc[3] - mov 8*4(%rax), @acc[4] - mov 8*5(%rax), @acc[5] - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - xor %rax, %rax - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - mov 0(%rsp), @acc[6] - - not %rax # 2*x > p, which means "negative" - - test @acc[0], @acc[0] - cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) - - test @acc[6], @acc[6] - cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) - - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp), %rbx -.cfi_restore %rbx - mov 16(%rsp), %rbp -.cfi_restore %rbp - lea 24(%rsp), %rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x -___ -} -if (0) { -my $inp = $win64 ? 
"%rcx" : "%rdi"; -$code.=<<___; -.globl nbits_384 -.hidden nbits_384 -.type nbits_384,\@abi-omnipotent -.align 32 -nbits_384: - mov 8*5($inp), %r8 - mov 8*4($inp), %r9 - mov 8*3($inp), %r10 - mov 8*2($inp), %r11 - mov \$-1, %rdx - mov \$127, %eax - bsr %r8, %r8 - cmovnz %rdx,%r9 - cmovz %rax,%r8 - bsr %r9, %r9 - cmovnz %rdx,%r10 - cmovz %rax,%r9 - xor \$63,%r8 - bsr %r10, %r10 - cmovnz %rdx, %r11 - cmovz %rax, %r10 - xor \$63,%r9 - add %r8, %r9 - mov 8*1($inp), %r8 - bsr %r11, %r11 - cmovnz %rdx, %r8 - cmovz %rax, %r11 - xor \$63, %r10 - add %r9, %r10 - mov 8*0($inp), %r9 - bsr %r8, %r8 - cmovnz %rdx, %r9 - cmovz %rax, %r8 - xor \$63, %r11 - add %r10, %r11 - bsr %r9, %r9 - cmovz %rax, %r9 - xor \$63, %r8 - add %r11, %r8 - xor \$63, %r9 - add %r8, %r9 - mov \$384, %eax - sub %r9, %rax - ret -.size nbits_384,.-nbits_384 -___ -} - -if (1) { -my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") - : ("%rdi", "%rsi", "%rdx", "%ecx"); - -sub vec_select { -my $sz = shift; -my $half = $sz/2; -my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); - -$code.=<<___; -.globl vec_select_$sz -.hidden vec_select_$sz -.type vec_select_$sz,\@abi-omnipotent -.align 32 -vec_select_$sz: - movd $select, %xmm5 - pxor %xmm4,%xmm4 - pshufd \$0,%xmm5,%xmm5 # broadcast - movdqu ($inp1),$xmm0 - lea $half($inp1),$inp1 - pcmpeqd %xmm4,%xmm5 - movdqu ($inp2),$xmm1 - lea $half($inp2),$inp2 - pcmpeqd %xmm5,%xmm4 - lea $half($out),$out -___ -for($i=0; $i<$sz-16; $i+=16) { -$code.=<<___; - pand %xmm4,$xmm0 - movdqu $i+16-$half($inp1),$xmm2 - pand %xmm5,$xmm1 - movdqu $i+16-$half($inp2),$xmm3 - por $xmm1,$xmm0 - movdqu $xmm0,$i-$half($out) -___ - ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); -} -$code.=<<___; - pand %xmm4,$xmm0 - pand %xmm5,$xmm1 - por $xmm1,$xmm0 - movdqu $xmm0,$i-$half($out) - ret -.size vec_select_$sz,.-vec_select_$sz -___ -} -vec_select(32); -vec_select(48); -vec_select(96); -vec_select(192); -vec_select(144); -vec_select(288); -} - -{ -my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi"); - -$code.=<<___; -.globl vec_prefetch -.hidden vec_prefetch -.type vec_prefetch,\@abi-omnipotent -.align 32 -vec_prefetch: - leaq -1($inp,$end), $end - mov \$64, %rax - xor %r8, %r8 - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - cmova %r8, %rax - prefetchnta ($inp) - lea ($inp,%rax), $inp - cmp $end, $inp - cmova $end, $inp - prefetchnta ($inp) - ret -.size vec_prefetch,.-vec_prefetch -___ -my $len = $win64 ? "%edx" : "%esi"; - -$code.=<<___; -.globl vec_is_zero_16x -.hidden vec_is_zero_16x -.type vec_is_zero_16x,\@abi-omnipotent -.align 32 -vec_is_zero_16x: - shr \$4, $len - movdqu ($inp), %xmm0 - lea 16($inp), $inp - -.Loop_is_zero: - dec $len - jz .Loop_is_zero_done - movdqu ($inp), %xmm1 - lea 16($inp), $inp - por %xmm1, %xmm0 - jmp .Loop_is_zero - -.Loop_is_zero_done: - pshufd \$0x4e, %xmm0, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, %rax - inc $len # now it's 1 - test %rax, %rax - cmovnz $len, %eax - xor \$1, %eax - ret -.size vec_is_zero_16x,.-vec_is_zero_16x -___ -} -{ -my ($inp1, $inp2, $len) = $win64 ? 
("%rcx", "%rdx", "%r8d") - : ("%rdi", "%rsi", "%edx"); -$code.=<<___; -.globl vec_is_equal_16x -.hidden vec_is_equal_16x -.type vec_is_equal_16x,\@abi-omnipotent -.align 32 -vec_is_equal_16x: - shr \$4, $len - movdqu ($inp1), %xmm0 - movdqu ($inp2), %xmm1 - sub $inp1, $inp2 - lea 16($inp1), $inp1 - pxor %xmm1, %xmm0 - -.Loop_is_equal: - dec $len - jz .Loop_is_equal_done - movdqu ($inp1), %xmm1 - movdqu ($inp1,$inp2), %xmm2 - lea 16($inp1), $inp1 - pxor %xmm2, %xmm1 - por %xmm1, %xmm0 - jmp .Loop_is_equal - -.Loop_is_equal_done: - pshufd \$0x4e, %xmm0, %xmm1 - por %xmm1, %xmm0 - movq %xmm0, %rax - inc $len # now it's 1 - test %rax, %rax - cmovnz $len, %eax - xor \$1, %eax - ret -.size vec_is_equal_16x,.-vec_is_equal_16x -___ -} -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/add_mod_384x384-x86_64.pl b/crypto/blst_src/asm/add_mod_384x384-x86_64.pl deleted file mode 100755 index 6ee3cf8760a..00000000000 --- a/crypto/blst_src/asm/add_mod_384x384-x86_64.pl +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -# common accumulator layout -@acc=map("%r$_",(8..15)); - -############################################################ 384x384 add/sub -# Double-width addition/subtraction modulo n<<384, as opposite to -# naively expected modulo n*n. It works because n<<384 is the actual -# input boundary condition for Montgomery reduction, not n*n. -# Just in case, this is duplicated, but only one module is -# supposed to be linked... 
-{ -my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.text - -.type __add_mod_384x384,\@abi-omnipotent -.align 32 -__add_mod_384x384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov 8*6($a_ptr), @acc[6] - - add 8*0($b_org), @acc[0] - mov 8*7($a_ptr), @acc[7] - adc 8*1($b_org), @acc[1] - mov 8*8($a_ptr), @acc[8] - adc 8*2($b_org), @acc[2] - mov 8*9($a_ptr), @acc[9] - adc 8*3($b_org), @acc[3] - mov 8*10($a_ptr), @acc[10] - adc 8*4($b_org), @acc[4] - mov 8*11($a_ptr), @acc[11] - adc 8*5($b_org), @acc[5] - mov @acc[0], 8*0($r_ptr) - adc 8*6($b_org), @acc[6] - mov @acc[1], 8*1($r_ptr) - adc 8*7($b_org), @acc[7] - mov @acc[2], 8*2($r_ptr) - adc 8*8($b_org), @acc[8] - mov @acc[4], 8*4($r_ptr) - mov @acc[6], @acc[0] - adc 8*9($b_org), @acc[9] - mov @acc[3], 8*3($r_ptr) - mov @acc[7], @acc[1] - adc 8*10($b_org), @acc[10] - mov @acc[5], 8*5($r_ptr) - mov @acc[8], @acc[2] - adc 8*11($b_org), @acc[11] - mov @acc[9], @acc[3] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[7] - mov @acc[10], @acc[4] - sbb 8*2($n_ptr), @acc[8] - sbb 8*3($n_ptr), @acc[9] - sbb 8*4($n_ptr), @acc[10] - mov @acc[11], @acc[5] - sbb 8*5($n_ptr), @acc[11] - sbb \$0, $b_org - - cmovc @acc[0], @acc[6] - cmovc @acc[1], @acc[7] - cmovc @acc[2], @acc[8] - mov @acc[6], 8*6($r_ptr) - cmovc @acc[3], @acc[9] - mov @acc[7], 8*7($r_ptr) - cmovc @acc[4], @acc[10] - mov @acc[8], 8*8($r_ptr) - cmovc @acc[5], @acc[11] - mov @acc[9], 8*9($r_ptr) - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __add_mod_384x384,.-__add_mod_384x384 - -.type __sub_mod_384x384,\@abi-omnipotent -.align 32 -__sub_mod_384x384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov 8*6($a_ptr), @acc[6] - - sub 8*0($b_org), @acc[0] - mov 8*7($a_ptr), @acc[7] - sbb 8*1($b_org), @acc[1] - mov 8*8($a_ptr), @acc[8] - sbb 8*2($b_org), @acc[2] - mov 8*9($a_ptr), @acc[9] - sbb 8*3($b_org), @acc[3] - mov 8*10($a_ptr), @acc[10] - sbb 8*4($b_org), @acc[4] - mov 8*11($a_ptr), @acc[11] - sbb 8*5($b_org), @acc[5] - mov @acc[0], 8*0($r_ptr) - sbb 8*6($b_org), @acc[6] - mov 8*0($n_ptr), @acc[0] - mov @acc[1], 8*1($r_ptr) - sbb 8*7($b_org), @acc[7] - mov 8*1($n_ptr), @acc[1] - mov @acc[2], 8*2($r_ptr) - sbb 8*8($b_org), @acc[8] - mov 8*2($n_ptr), @acc[2] - mov @acc[3], 8*3($r_ptr) - sbb 8*9($b_org), @acc[9] - mov 8*3($n_ptr), @acc[3] - mov @acc[4], 8*4($r_ptr) - sbb 8*10($b_org), @acc[10] - mov 8*4($n_ptr), @acc[4] - mov @acc[5], 8*5($r_ptr) - sbb 8*11($b_org), @acc[11] - mov 8*5($n_ptr), @acc[5] - sbb $b_org, $b_org - - and $b_org, @acc[0] - and $b_org, @acc[1] - and $b_org, @acc[2] - and $b_org, @acc[3] - and $b_org, @acc[4] - and $b_org, @acc[5] - - add @acc[0], @acc[6] - adc @acc[1], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[2], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[3], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[4], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[5], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sub_mod_384x384,.-__sub_mod_384x384 - -.globl add_mod_384x384 -.hidden add_mod_384x384 -.type add_mod_384x384,\@function,4,"unwind" -.align 32 -add_mod_384x384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 
-.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __add_mod_384x384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size add_mod_384x384,.-add_mod_384x384 - -.globl sub_mod_384x384 -.hidden sub_mod_384x384 -.type sub_mod_384x384,\@function,4,"unwind" -.align 32 -sub_mod_384x384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sub_mod_384x384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sub_mod_384x384,.-sub_mod_384x384 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/arm-xlate.pl b/crypto/blst_src/asm/arm-xlate.pl deleted file mode 100755 index 35aab37407b..00000000000 --- a/crypto/blst_src/asm/arm-xlate.pl +++ /dev/null @@ -1,386 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ARM assembler distiller/adapter by \@dot-asm. - -use strict; - -################################################################ -# Recognized "flavour"-s are: -# -# linux[32|64] GNU assembler, effectively pass-through -# ios[32|64] global symbols' decorations, PIC tweaks, etc. -# win[32|64] Visual Studio armasm-specific directives -# coff[32|64] e.g. clang --target=arm-windows ... -# -my $flavour = shift; - $flavour = "linux" if (!$flavour or $flavour eq "void"); - -my $output = shift; -open STDOUT,">$output" || die "can't open $output: $!"; - -my %GLOBALS; -my $dotinlocallabels = ($flavour !~ /ios/) ? 
1 : 0; -my $in_proc; # used with 'windows' flavour - -################################################################ -# directives which need special treatment on different platforms -################################################################ -my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch -my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu - -my $rodata = sub { - SWITCH: for ($flavour) { - /linux/ && return ".section\t.rodata"; - /ios/ && return ".section\t__TEXT,__const"; - /coff/ && return ".section\t.rdata,\"dr\""; - /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; - last; - } -}; - -my $hidden = sub { - if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } -} if ($flavour !~ /linux/); - -my $comm = sub { - my @args = split(/,\s*/,shift); - my $name = @args[0]; - my $global = \$GLOBALS{$name}; - my $ret; - - if ($flavour =~ /ios32/) { - $ret = ".comm\t_$name,@args[1]\n"; - $ret .= ".non_lazy_symbol_pointer\n"; - $ret .= "$name:\n"; - $ret .= ".indirect_symbol\t_$name\n"; - $ret .= ".long\t0\n"; - $ret .= ".previous"; - $name = "_$name"; - } elsif ($flavour =~ /win/) { - $ret = "\tCOMMON\t|$name|,@args[1]"; - } elsif ($flavour =~ /coff/) { - $ret = ".comm\t$name,@args[1]"; - } else { - $ret = ".comm\t".join(',',@args); - } - - $$global = $name; - $ret; -}; - -my $globl = sub { - my $name = shift; - my $global = \$GLOBALS{$name}; - my $ret; - - SWITCH: for ($flavour) { - /ios/ && do { $name = "_$name"; last; }; - /win/ && do { $ret = ""; last; }; - } - - $ret = ".globl $name" if (!defined($ret)); - $$global = $name; - $ret; -}; -my $global = $globl; - -my $extern = sub { - &$globl(@_); - if ($flavour =~ /win/) { - return "\tEXTERN\t@_"; - } - return; # return nothing -}; - -my $type = sub { - my $arg = join(',',@_); - my $ret; - - SWITCH: for ($flavour) { - /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { - $ret = "#ifdef __thumb2__\n" . - ".thumb_func $1\n" . - "#endif"; - } - last; - }; - /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { - my $type = "[DATA]"; - if ($2 eq "function") { - $in_proc = $1; - $type = "[FUNC]"; - } - $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" - : ""; - } - last; - }; - /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { - $ret = ".def $1;\n". - ".type 32;\n". - ".endef"; - } - last; - }; - } - return $ret; -} if ($flavour !~ /linux/); - -my $size = sub { - if ($in_proc && $flavour =~ /win/) { - $in_proc = undef; - return "\tENDP"; - } -} if ($flavour !~ /linux/); - -my $inst = sub { - if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } - else { ".long\t".join(',',@_); } -} if ($flavour !~ /linux/); - -my $asciz = sub { - my $line = join(",",@_); - if ($line =~ /^"(.*)"$/) - { if ($flavour =~ /win/) { - "\tDCB\t$line,0\n\tALIGN\t4"; - } else { - ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; - } - } else { ""; } -}; - -my $align = sub { - "\tALIGN\t".2**@_[0]; -} if ($flavour =~ /win/); - $align = sub { - ".p2align\t".@_[0]; -} if ($flavour =~ /coff/); - -my $byte = sub { - "\tDCB\t".join(',',@_); -} if ($flavour =~ /win/); - -my $short = sub { - "\tDCWU\t".join(',',@_); -} if ($flavour =~ /win/); - -my $word = sub { - "\tDCDU\t".join(',',@_); -} if ($flavour =~ /win/); - -my $long = $word if ($flavour =~ /win/); - -my $quad = sub { - "\tDCQU\t".join(',',@_); -} if ($flavour =~ /win/); - -my $skip = sub { - "\tSPACE\t".shift; -} if ($flavour =~ /win/); - -my $code = sub { - "\tCODE@_[0]"; -} if ($flavour =~ /win/); - -my $thumb = sub { # .thumb should appear prior .text in source - "# define ARM THUMB\n" . - "\tTHUMB"; -} if ($flavour =~ /win/); - -my $text = sub { - "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); -} if ($flavour =~ /win/); - -my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax - -my $rva = sub { - # .rva directive comes in handy only on 32-bit Windows, i.e. it can - # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. - # However! Corresponding compilers don't seem to bet on PIC, which - # raises the question why would assembler programmer have to jump - # through the hoops? But just in case, it would go as following: - # - # ldr r1,.LOPENSSL_armcap - # ldr r2,.LOPENSSL_armcap+4 - # adr r0,.LOPENSSL_armcap - # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas - # sub r0,r0,r1 ; r0 is image base now - # ldr r0,[r0,r2] - # ... - #.LOPENSSL_armcap: - # .rva .LOPENSSL_armcap ; self-reference - # .rva OPENSSL_armcap_P ; real target - # - # Non-position-independent [and ISA-neutral] alternative is so much - # simpler: - # - # ldr r0,.LOPENSSL_armcap - # ldr r0,[r0] - # ... - #.LOPENSSL_armcap: - # .long OPENSSL_armcap_P - # - "\tDCDU\t@_[0]\n\tRELOC\t2" -} if ($flavour =~ /win(?!64)/); - -################################################################ -# some broken instructions in Visual Studio armasm[64]... - -my $it = sub {} if ($flavour =~ /win32/); # omit 'it' - -my $ext = sub { - "\text8\t".join(',',@_); -} if ($flavour =~ /win64/); - -my $csel = sub { - my ($args,$comment) = split(m|\s*//|,shift); - my @regs = split(m|,\s*|,$args); - my $cond = pop(@regs); - - "\tcsel$cond\t".join(',',@regs); -} if ($flavour =~ /win64/); - -my $csetm = sub { - my ($args,$comment) = split(m|\s*//|,shift); - my @regs = split(m|,\s*|,$args); - my $cond = pop(@regs); - - "\tcsetm$cond\t".join(',',@regs); -} if ($flavour =~ /win64/); - -# ... then conditional branch instructions are also broken, but -# maintaining all the variants is tedious, so I kludge-fix it -# elsewhere... -################################################################ -my $adrp = sub { - my ($args,$comment) = split(m|\s*//|,shift); - "\tadrp\t$args\@PAGE"; -} if ($flavour =~ /ios64/); - -my $paciasp = sub { - ($flavour =~ /linux/) ? "\t.inst\t0xd503233f" - : &$inst(0xd503233f); -}; - -my $autiasp = sub { - ($flavour =~ /linux/) ? 
"\t.inst\t0xd50323bf" - : &$inst(0xd50323bf); -}; - -sub range { - my ($r,$sfx,$start,$end) = @_; - - join(",",map("$r$_$sfx",($start..$end))); -} - -sub expand_line { - my $line = shift; - my @ret = (); - - pos($line)=0; - - while ($line =~ m/\G[^@\/\{\"]*/g) { - if ($line =~ m/\G(@|\/\/|$)/gc) { - last; - } - elsif ($line =~ m/\G\{/gc) { - my $saved_pos = pos($line); - $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; - pos($line) = $saved_pos; - $line =~ m/\G[^\}]*\}/g; - } - elsif ($line =~ m/\G\"/gc) { - $line =~ m/\G[^\"]*\"/g; - } - } - - $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; - - if ($flavour =~ /win/) { - # adjust alignment hints, "[rN,:32]" -> "[rN@32]" - $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; - # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" - $line =~ s/\.(L\w{2,})/|\$$1|/g; - # omit "#:lo12:" on win64 - $line =~ s/#:lo12://; - } elsif ($flavour =~ /coff(?!64)/) { - $line =~ s/\.L(\w{2,})/(\$ML$1)/g; - } elsif ($flavour =~ /ios64/) { - $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; - } - - if ($flavour =~ /64/) { - # "vX.Md[N]" -> "vX.d[N] - $line =~ s/\b(v[0-9]+)\.[1-9]+([bhsd]\[[0-9]+\])/$1.$2/; - } - - return $line; -} - -while(my $line=<>) { - - # fix up assembler-specific commentary delimiter - $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); - - if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } - - $line =~ s|/\*.*\*/||; # get rid of C-style comments... - $line =~ s|^\s+||; # ... and skip white spaces in beginning... - $line =~ s|\s+$||; # ... and at the end - - { - $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel - $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); - } - - { - $line =~ s|(^[\.\w]+)\:\s*||; - my $label = $1; - if ($label) { - $label = ($GLOBALS{$label} or $label); - if ($flavour =~ /win/) { - $label =~ s|^\.L(?=\w)|\$L|; - printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : ""); - } else { - $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); - printf "%s:", $label; - } - } - } - - if ($line !~ m/^[#@;]/) { - $line =~ s|^\s*(\.?)(\S+)\s*||; - my $c = $1; $c = "\t" if ($c eq ""); - my $mnemonic = $2; - my $opcode; - if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { - $opcode = eval("\$$1_$2"); - } else { - $opcode = eval("\$$mnemonic"); - } - - my $arg=expand_line($line); - - if (ref($opcode) eq 'CODE') { - $line = &$opcode($arg); - } elsif ($mnemonic) { - if ($flavour =~ /win64/) { - # "b.cond" -> "bcond", kludge-fix:-( - $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; - } - $line = $c.$mnemonic; - $line.= "\t$arg" if ($arg ne ""); - } - } - - print $line if ($line); - print "\n"; -} - -print "\tEND\n" if ($flavour =~ /win/); - -close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl deleted file mode 100755 index ced8c6c37e9..00000000000 --- a/crypto/blst_src/asm/ct_inverse_mod_256-armv8.pl +++ /dev/null @@ -1,586 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - -# on Cortex-A57. 
-# -# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, -# const vec256 modx); -# -$python_ref.=<<'___'; -def ct_inverse_mod_256(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 31 - mask = (1 << k) - 1 - - for i in range(0, 512 // k - 1): - # __ab_approximation_31 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-k-2)) << k) - b_ = (b & mask) | ((b >> (n-k-2)) << k) - - # __inner_loop_31 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smul_256_n_shift_by_31 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smul_512x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 512 % k + k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 512 % k + k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - mod <<= 512 - mod.bit_length() # align to the left - if v < 0: - v += mod - if v < 0: - v += mod - elif v == 1<<512 - v -= mod - - return v & (2**512 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); -my @acc=map("x$_",(4..11)); -my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17)); -my $cnt = $n_ptr; -my @t = map("x$_",(19..26)); -my ($a_lo, $b_lo) = @acc[3,7]; - -$frame = 16+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_256 -.type ct_inverse_mod_256, %function -.align 5 -ct_inverse_mod_256: - paciasp - stp x29, x30, [sp,#-80]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - sub sp, sp, #$frame - - ldp @acc[0], @acc[1], [$in_ptr,#8*0] - ldp @acc[2], @acc[3], [$in_ptr,#8*2] - - add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot - and $in_ptr, $in_ptr, #-512 // in the frame... 
- str $out_ptr, [sp] - - ldp @acc[4], @acc[5], [$n_ptr,#8*0] - ldp @acc[6], @acc[7], [$n_ptr,#8*2] - - stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| - stp @acc[2], @acc[3], [$in_ptr,#8*2] - stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b| - stp @acc[6], @acc[7], [$in_ptr,#8*6] - - ////////////////////////////////////////// first iteration - bl .Lab_approximation_31_256_loaded - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - str $f0,[$out_ptr,#8*8] // initialize |u| with |f0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*4 // pointer to dst |b| - bl __smul_256_n_shift_by_31 - str $f0, [$out_ptr,#8*9] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - ldr @acc[4], [$in_ptr,#8*8] // |u| - ldr @acc[5], [$in_ptr,#8*13] // |v| - madd @acc[0], $f_, @acc[4], xzr // |u|*|f0| - madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0| - str @acc[0], [$out_ptr,#8*4] - asr @acc[1], @acc[0], #63 // sign extenstion - stp @acc[1], @acc[1], [$out_ptr,#8*5] - stp @acc[1], @acc[1], [$out_ptr,#8*7] - - madd @acc[0], $f0, @acc[4], xzr // |u|*|f1| - madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1| - str @acc[0], [$out_ptr,#8*9] - asr @acc[1], @acc[0], #63 // sign extenstion - stp @acc[1], @acc[1], [$out_ptr,#8*10] - stp @acc[1], @acc[1], [$out_ptr,#8*12] -___ -for($i=2; $i<15; $i++) { -$code.=<<___; - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add $out_ptr, $out_ptr, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc @t[3], @t[3], @t[4] - str @t[3], [$out_ptr,#8*4] - - mov $f_, $f0 // corrected |f1| - mov $g_, $g0 // corrected |g1| - add $out_ptr, $out_ptr, #8*5 // pointer to destination |v| - bl __smul_256x63 -___ -$code.=<<___ if ($i>7); - bl __smul_512x63_tail -___ -$code.=<<___ if ($i<=7); - adc @t[3], @t[3], @t[4] - stp @t[3], @t[3], [$out_ptr,#8*4] - stp @t[3], @t[3], [$out_ptr,#8*6] -___ -} -$code.=<<___; - ////////////////////////////////////////// two[!] 
last iterations - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #47 // 31 + 512 % 31 - //bl __ab_approximation_62_256 // |a| and |b| are exact, - ldr $a_lo, [$in_ptr,#8*0] // just load - ldr $b_lo, [$in_ptr,#8*4] - bl __inner_loop_62_256 - - mov $f_, $f1 - mov $g_, $g1 - ldr $out_ptr, [sp] // original out_ptr - bl __smul_256x63 - bl __smul_512x63_tail - ldr x30, [x29,#8] - - smulh @t[1], @acc[3], $g_ // figure out top-most limb - ldp @acc[4], @acc[5], [$nx_ptr,#8*0] - adc @t[4], @t[4], @t[6] - ldp @acc[6], @acc[7], [$nx_ptr,#8*2] - - add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 - asr @t[0], @t[1], #63 // sign as mask - - and @t[4], @acc[4], @t[0] // add mod<<256 conditionally - and @t[5], @acc[5], @t[0] - adds @acc[0], @acc[0], @t[4] - and @t[6], @acc[6], @t[0] - adcs @acc[1], @acc[1], @t[5] - and @t[7], @acc[7], @t[0] - adcs @acc[2], @acc[2], @t[6] - adcs @acc[3], @t[3], @t[7] - adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 - - neg @t[0], @t[1] - orr @t[1], @t[1], @t[0] // excess bit or sign as mask - asr @t[0], @t[0], #63 // excess bit as mask - - and @acc[4], @acc[4], @t[1] // mask |mod| - and @acc[5], @acc[5], @t[1] - and @acc[6], @acc[6], @t[1] - and @acc[7], @acc[7], @t[1] - - eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| - eor @acc[5], @acc[5], @t[0] - adds @acc[4], @acc[4], @t[0], lsr#63 - eor @acc[6], @acc[6], @t[0] - adcs @acc[5], @acc[5], xzr - eor @acc[7], @acc[7], @t[0] - adcs @acc[6], @acc[6], xzr - adc @acc[7], @acc[7], xzr - - adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 - adcs @acc[1], @acc[1], @acc[5] - adcs @acc[2], @acc[2], @acc[6] - stp @acc[0], @acc[1], [$out_ptr,#8*4] - adc @acc[3], @acc[3], @acc[7] - stp @acc[2], @acc[3], [$out_ptr,#8*6] - - add sp, sp, #$frame - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldr x29, [sp],#80 - autiasp - ret -.size ct_inverse_mod_256,.-ct_inverse_mod_256 - -//////////////////////////////////////////////////////////////////////// -.type __smul_256x63, %function -.align 5 -__smul_256x63: -___ -for($j=0; $j<2; $j++) { -my $f_ = $f_; $f_ = $g_ if ($j); -my @acc = @acc; @acc = @acc[4..7] if ($j); -my $k = 8*8+8*5*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) - asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) - ldr @t[3+$j], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) - sub $f_, $f_, $f1 - eor @acc[1], @acc[1], $f1 - adds @acc[0], @acc[0], $f1, lsr#63 - eor @acc[2], @acc[2], $f1 - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], $f1 - adcs @acc[2], @acc[2], xzr - eor @t[3+$j], @t[3+$j], $f1 - umulh @t[0], @acc[0], $f_ - adcs @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], $f_ - adcs @t[3+$j], @t[3+$j], xzr - umulh @t[2], @acc[2], $f_ -___ -$code.=<<___ if ($j!=0); - adc $g1, xzr, xzr // used in __smul_512x63_tail -___ -$code.=<<___; - mul @acc[0], @acc[0], $f_ - cmp $f_, #0 - mul @acc[1], @acc[1], $f_ - csel @t[3+$j], @t[3+$j], xzr, ne - mul @acc[2], @acc[2], $f_ - adds @acc[1], @acc[1], @t[0] - mul @t[5+$j], @acc[3], $f_ - adcs @acc[2], @acc[2], @t[1] - adcs @t[5+$j], @t[5+$j], @t[2] -___ -$code.=<<___ if ($j==0); - adc @t[7], xzr, xzr -___ -} -$code.=<<___; - adc @t[7], @t[7], xzr - - adds @acc[0], @acc[0], @acc[4] - adcs @acc[1], @acc[1], @acc[5] - adcs @acc[2], @acc[2], @acc[6] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @t[5], @t[5], @t[6] - 
stp @acc[2], @t[5], [$out_ptr,#8*2] - - ret -.size __smul_256x63,.-__smul_256x63 - -.type __smul_512x63_tail, %function -.align 5 -__smul_512x63_tail: - umulh @t[5], @acc[3], $f_ - ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v| - adc @t[7], @t[7], xzr - ldr @acc[3], [$in_ptr,#8*20] - and @t[3], @t[3], $f_ - - umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain - - sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain - asr @t[6], @t[5], #63 - - eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| - eor @acc[2], @acc[2], $f1 - adds @acc[1], @acc[1], $g1 - eor @acc[3], @acc[3], $f1 - adcs @acc[2], @acc[2], xzr - umulh @t[0], @t[4], $g_ - adc @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], $g_ - add @acc[7], @acc[7], @t[7] - umulh @t[2], @acc[2], $g_ - - mul @acc[0], @t[4], $g_ - mul @acc[1], @acc[1], $g_ - adds @acc[0], @acc[0], @acc[7] - mul @acc[2], @acc[2], $g_ - adcs @acc[1], @acc[1], @t[0] - mul @t[3], @acc[3], $g_ - adcs @acc[2], @acc[2], @t[1] - adcs @t[3], @t[3], @t[2] - adc @t[4], xzr, xzr // used in the final step - - adds @acc[0], @acc[0], @t[5] - adcs @acc[1], @acc[1], @t[6] - adcs @acc[2], @acc[2], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*4] - adcs @t[3], @t[3], @t[6] // carry is used in the final step - stp @acc[2], @t[3], [$out_ptr,#8*6] - - ret -.size __smul_512x63_tail,.-__smul_512x63_tail - -.type __smul_256_n_shift_by_31, %function -.align 5 -__smul_256_n_shift_by_31: -___ -for($j=0; $j<2; $j++) { -my $f0 = $f0; $f0 = $g0 if ($j); -my @acc = @acc; @acc = @acc[4..7] if ($j); -my $k = 8*4*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) - asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) - - eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) - sub @t[6], @t[6], @t[5] - eor @acc[1], @acc[1], @t[5] - adds @acc[0], @acc[0], @t[5], lsr#63 - eor @acc[2], @acc[2], @t[5] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[5] - umulh @t[0], @acc[0], @t[6] - adcs @acc[2], @acc[2], xzr - umulh @t[1], @acc[1], @t[6] - adc @acc[3], @acc[3], xzr - umulh @t[2], @acc[2], @t[6] - and @t[5], @t[5], @t[6] - umulh @t[3+$j], @acc[3], @t[6] - neg @t[5], @t[5] - - mul @acc[0], @acc[0], @t[6] - mul @acc[1], @acc[1], @t[6] - mul @acc[2], @acc[2], @t[6] - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], @t[1] - adcs @acc[3], @acc[3], @t[2] - adc @t[3+$j], @t[3+$j], @t[5] -___ -} -$code.=<<___; - adds @acc[0], @acc[0], @acc[4] - adcs @acc[1], @acc[1], @acc[5] - adcs @acc[2], @acc[2], @acc[6] - adcs @acc[3], @acc[3], @acc[7] - adc @acc[4], @t[3], @t[4] - - extr @acc[0], @acc[1], @acc[0], #31 - extr @acc[1], @acc[2], @acc[1], #31 - extr @acc[2], @acc[3], @acc[2], #31 - asr @t[4], @acc[4], #63 // result's sign as mask - extr @acc[3], @acc[4], @acc[3], #31 - - eor @acc[0], @acc[0], @t[4] // ensure the result is positive - eor @acc[1], @acc[1], @t[4] - adds @acc[0], @acc[0], @t[4], lsr#63 - eor @acc[2], @acc[2], @t[4] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[4] - adcs @acc[2], @acc[2], xzr - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adc @acc[3], @acc[3], xzr - stp @acc[2], @acc[3], [$out_ptr,#8*2] - - eor $f0, $f0, @t[4] // adjust |f/g| accordingly - eor $g0, $g0, @t[4] - sub $f0, $f0, @t[4] - sub $g0, $g0, @t[4] - - ret -.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 -___ - -{ -my @a = @acc[0..3]; -my @b = @acc[4..7]; -my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); - 
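For reference, the approximation step that the __ab_approximation_31_256 routine below implements can be sketched in plain Python, mirroring the a_/b_ construction in the reference pseudocode near the top of this file; the helper name is illustrative only and not part of blst:

    def ab_approximation_31(a, b, k=31):
        # Sketch of __ab_approximation_31_256: keep the low k bits of |a|
        # and |b| exactly, and graft the top bits of whichever operand is
        # longer on top, so the inner loop can work on one-limb values.
        n = max(a.bit_length(), b.bit_length())
        if n < 64:
            return a, b
        mask = (1 << k) - 1
        a_ = (a & mask) | ((a >> (n - k - 2)) << k)
        b_ = (b & mask) | ((b >> (n - k - 2)) << k)
        return a_, b_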
-$code.=<<___; -.type __ab_approximation_31_256, %function -.align 4 -__ab_approximation_31_256: - ldp @a[2], @a[3], [$in_ptr,#8*2] - ldp @b[2], @b[3], [$in_ptr,#8*6] - ldp @a[0], @a[1], [$in_ptr,#8*0] - ldp @b[0], @b[1], [$in_ptr,#8*4] - -.Lab_approximation_31_256_loaded: - orr @t[0], @a[3], @b[3] // check top-most limbs, ... - cmp @t[0], #0 - csel @a[3], @a[3], @a[2], ne - csel @b[3], @b[3], @b[2], ne - csel @a[2], @a[2], @a[1], ne - orr @t[0], @a[3], @b[3] // and ones before top-most, ... - csel @b[2], @b[2], @b[1], ne - - cmp @t[0], #0 - csel @a[3], @a[3], @a[2], ne - csel @b[3], @b[3], @b[2], ne - csel @a[2], @a[2], @a[0], ne - orr @t[0], @a[3], @b[3] // and one more, ... - csel @b[2], @b[2], @b[0], ne - - clz @t[0], @t[0] - cmp @t[0], #64 - csel @t[0], @t[0], xzr, ne - csel @a[3], @a[3], @a[2], ne - csel @b[3], @b[3], @b[2], ne - neg @t[1], @t[0] - - lslv @a[3], @a[3], @t[0] // align high limbs to the left - lslv @b[3], @b[3], @t[0] - lsrv @a[2], @a[2], @t[1] - lsrv @b[2], @b[2], @t[1] - and @a[2], @a[2], @t[1], asr#6 - and @b[2], @b[2], @t[1], asr#6 - orr $a_lo, @a[3], @a[2] - orr $b_lo, @b[3], @b[2] - - bfxil $a_lo, @a[0], #0, #31 - bfxil $b_lo, @b[0], #0, #31 - - b __inner_loop_31_256 - ret -.size __ab_approximation_31_256,.-__ab_approximation_31_256 - -.type __inner_loop_31_256, %function -.align 4 -__inner_loop_31_256: - mov $cnt, #31 - mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov $bias,#0x7FFFFFFF7FFFFFFF - -.Loop_31_256: - sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting - sub $cnt, $cnt, #1 - and @t[0], $b_lo, @t[3] - sub @t[1], $b_lo, $a_lo // |b_|-|a_| - subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov @t[0], $fg1 - csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| - csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| - csel $fg0, $fg0, @t[0], hs - lsr $a_lo, $a_lo, #1 - and @t[0], $fg1, @t[3] - and @t[1], $bias, @t[3] - sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add $fg1, $fg1, $fg1 // |f1|<<=1 - add $fg0, $fg0, @t[1] - sub $fg1, $fg1, $bias - cbnz $cnt, .Loop_31_256 - - mov $bias, #0x7FFFFFFF - ubfx $f0, $fg0, #0, #32 - ubfx $g0, $fg0, #32, #32 - ubfx $f1, $fg1, #0, #32 - ubfx $g1, $fg1, #32, #32 - sub $f0, $f0, $bias // remove bias - sub $g0, $g0, $bias - sub $f1, $f1, $bias - sub $g1, $g1, $bias - - ret -.size __inner_loop_31_256,.-__inner_loop_31_256 - -.type __inner_loop_62_256, %function -.align 4 -__inner_loop_62_256: - mov $f0, #1 // |f0|=1 - mov $g0, #0 // |g0|=0 - mov $f1, #0 // |f1|=0 - mov $g1, #1 // |g1|=1 - -.Loop_62_256: - sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting - sub $cnt, $cnt, #1 - and @t[0], $b_lo, @t[3] - sub @t[1], $b_lo, $a_lo // |b_|-|a_| - subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov @t[0], $f0 - csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| - csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - mov @t[1], $g0 - csel $f0, $f0, $f1, hs // exchange |f0| and |f1| - csel $f1, $f1, @t[0], hs - csel $g0, $g0, $g1, hs // exchange |g0| and |g1| - csel $g1, $g1, @t[1], hs - lsr $a_lo, $a_lo, #1 - and @t[0], $f1, @t[3] - and @t[1], $g1, @t[3] - add $f1, $f1, $f1 // |f1|<<=1 - add $g1, $g1, $g1 // |g1|<<=1 - sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) 
- cbnz $cnt, .Loop_62_256 - - ret -.size __inner_loop_62_256,.-__inner_loop_62_256 -___ -} - -foreach(split("\n",$code)) { - s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/; - print $_,"\n"; -} -close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl b/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl deleted file mode 100755 index 24ab5452930..00000000000 --- a/crypto/blst_src/asm/ct_inverse_mod_256-x86_64.pl +++ /dev/null @@ -1,837 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake. -# -# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, -# const vec256 modx); -# -$python_ref.=<<'___'; -def ct_inverse_mod_256(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 31 - mask = (1 << k) - 1 - - for i in range(0, 512 // k - 1): - # __ab_approximation_31 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-k-2)) << k) - b_ = (b & mask) | ((b >> (n-k-2)) << k) - - # __inner_loop_31 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smulq_256_n_shift_by_31 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smulq_512x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 512 % k + k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 512 % k + k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - mod <<= 512 - mod.bit_length() # align to the left - if v < 0: - v += mod - if v < 0: - v += mod - elif v == 1<<512 - v -= mod - - return v & (2**512 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc = map("%r$_",(8..15)); -my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); -my $cnt = "%edx"; - -$frame = 8*6+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_256 -.type ct_inverse_mod_256,\@function,4,"unwind" -.align 32 -ct_inverse_mod_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot - and \$-512, %rax # in the frame... 
- mov $out_ptr, 8*4(%rsp) - mov $nx_ptr, 8*5(%rsp) - - mov 8*0($in_ptr), @acc[0] # load input - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - - mov 8*0($n_ptr), @acc[4] # load modulus - mov 8*1($n_ptr), @acc[5] - mov 8*2($n_ptr), @acc[6] - mov 8*3($n_ptr), @acc[7] - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - - mov @acc[4], 8*4(%rax) # copy modulus to |b| - mov @acc[5], 8*5(%rax) - mov @acc[6], 8*6(%rax) - mov @acc[7], 8*7(%rax) - mov %rax, $in_ptr - - ################################# first iteration - mov \$31, $cnt - call __ab_approximation_31_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - mov $f1, 8*2(%rsp) - mov $g1, 8*3(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_256_n_shift_by_31 - #mov $f0, 8*0(%rsp) # corrected |f0| - #mov $g0, 8*1(%rsp) # corrected |g0| - mov $f0, 8*8($out_ptr) # initialize |u| with |f0| - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*4($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_256_n_shift_by_31 - #mov $f0, 8*2(%rsp) # corrected |f1| - #mov $g0, 8*3(%rsp) # corrected |g1| - mov $f0, 8*9($out_ptr) # initialize |v| with |f1| - - ################################# second iteration - xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - mov $f1, 8*2(%rsp) - mov $g1, 8*3(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_256_n_shift_by_31 - mov $f0, 8*0(%rsp) # corrected |f0| - mov $g0, 8*1(%rsp) # corrected |g0| - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*4($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_256_n_shift_by_31 - #mov $f0, 8*2(%rsp) # corrected |f1| - #mov $g0, 8*3(%rsp) # corrected |g1| - - mov 8*8($in_ptr), @acc[0] # |u| - mov 8*13($in_ptr), @acc[4] # |v| - mov @acc[0], @acc[1] - imulq 8*0(%rsp), @acc[0] # |u|*|f0| - mov @acc[4], @acc[5] - imulq 8*1(%rsp), @acc[4] # |v|*|g0| - add @acc[4], @acc[0] - mov @acc[0], 8*4($out_ptr) # destination |u| - sar \$63, @acc[0] # sign extension - mov @acc[0], 8*5($out_ptr) - mov @acc[0], 8*6($out_ptr) - mov @acc[0], 8*7($out_ptr) - mov @acc[0], 8*8($out_ptr) - lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor - - imulq $f0, @acc[1] # |u|*|f1| - imulq $g0, @acc[5] # |v|*|g1| - add @acc[5], @acc[1] - mov @acc[1], 8*9($out_ptr) # destination |v| - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*10($out_ptr) - mov @acc[1], 8*11($out_ptr) - mov @acc[1], 8*12($out_ptr) - mov @acc[1], 8*13($out_ptr) -___ -for($i=2; $i<15; $i++) { -my $smul_512x63 = $i>8 ? 
"__smulq_512x63" - : "__smulq_256x63"; -$code.=<<___; - xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - mov $f1, 8*2(%rsp) - mov $g1, 8*3(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_256_n_shift_by_31 - mov $f0, 8*0(%rsp) # corrected |f0| - mov $g0, 8*1(%rsp) # corrected |g0| - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*4($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_256_n_shift_by_31 - mov $f0, 8*2(%rsp) # corrected |f1| - mov $g0, 8*3(%rsp) # corrected |g1| - - mov 8*0(%rsp), $f0 # |f0| - mov 8*1(%rsp), $g0 # |g0| - lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*4($out_ptr), $out_ptr # pointer to destination |u| - call __smulq_256x63 - - mov 8*2(%rsp), $f0 # |f1| - mov 8*3(%rsp), $g0 # |g1| - lea 8*5($out_ptr),$out_ptr # pointer to destination |v| - call $smul_512x63 -___ -$code.=<<___ if ($i==8); - sar \$63, %rbp # sign extension - mov %rbp, 8*5($out_ptr) - mov %rbp, 8*6($out_ptr) - mov %rbp, 8*7($out_ptr) -___ -} -$code.=<<___; - ################################# two[!] last iterations in one go - xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$47, $cnt # 31 + 512 % 31 - #call __ab_approximation_31 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - #xor @acc[1], @acc[1] # |a_hi| - mov 8*4($in_ptr), @acc[2] # |b_lo| - #xor @acc[3], @acc[3] # |b_hi| - call __inner_loop_62_256 - #mov $f0, 8*0(%rsp) - #mov $g0, 8*1(%rsp) - #mov $f1, 8*2(%rsp) - #mov $g1, 8*3(%rsp) - - #mov 8*0(%rsp), $f0 # |f0| - #mov 8*1(%rsp), $g0 # |g0| - lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| - #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - #call __smulq_256x63 - - #mov 8*2(%rsp), $f0 # |f1| - #mov 8*3(%rsp), $g0 # |g1| - mov $f1, $f0 - mov $g1, $g0 - mov 8*4(%rsp), $out_ptr # original |out_ptr| - call __smulq_512x63 - adc %rbp, %rdx # the excess limb of the result - - mov 8*5(%rsp), $in_ptr # original |nx_ptr| - mov %rdx, %rax - sar \$63, %rdx # result's sign as mask - - mov %rdx, @acc[0] # mask |modulus| - mov %rdx, @acc[1] - and 8*0($in_ptr), @acc[0] - mov %rdx, @acc[2] - and 8*1($in_ptr), @acc[1] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), %rdx - - add @acc[0], @acc[4] # conditionally add |modulus|<<256 - adc @acc[1], @acc[5] - adc @acc[2], @acc[6] - adc %rdx, @acc[7] - adc \$0, %rax - - mov %rax, %rdx - neg %rax - or %rax, %rdx # excess bit or sign as mask - sar \$63, %rax # excess bit as mask - - mov %rdx, @acc[0] # mask |modulus| - mov %rdx, @acc[1] - and 8*0($in_ptr), @acc[0] - mov %rdx, @acc[2] - and 8*1($in_ptr), @acc[1] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), %rdx - - xor %rax, @acc[0] # conditionally negate |modulus| - xor %rcx, %rcx - xor %rax, @acc[1] - sub %rax, %rcx - xor %rax, @acc[2] - xor %rax, %rdx - add %rcx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, %rdx - - add @acc[0], @acc[4] # final adjustment for |modulus|<<256 - adc @acc[1], @acc[5] - adc @acc[2], @acc[6] - adc %rdx, @acc[7] - - mov @acc[4], 8*4($out_ptr) # store absolute value - mov @acc[5], 8*5($out_ptr) - mov @acc[6], 8*6($out_ptr) - mov @acc[7], 8*7($out_ptr) - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp 
-.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ct_inverse_mod_256,.-ct_inverse_mod_256 -___ -######################################################################## -# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers -# to the maximum bit-length of the *result*, and "63" - to the maximum -# bit-length of the |f?| and |g?| single-limb multiplicands. However! -# The latter should not be taken literally, as they are always chosen so -# that "bad things" don't happen. For example, there comes a point when -# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we -# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is -# because past that point |f0| is always 1 and |g0| is always 0. And, -# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to -# perform full-width |u|*|f1| multiplication, half-width one with sign -# extension is sufficient... -$code.=<<___; -.type __smulq_512x63,\@abi-omnipotent -.align 32 -__smulq_512x63: - mov 8*0($in_ptr), @acc[0] # load |u| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), %rbp # sign limb - - mov $f0, %rbx - sar \$63, $f0 # |f0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit - - xor $f0, %rbx # conditionally negate |f0| - add %rax, %rbx - - xor $f0, @acc[0] # conditionally negate |u| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, %rbp - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, %rbp - - mulq %rbx # |u|*|f0| - mov %rax, 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<3; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov @acc[$i], 8*$i($out_ptr) - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - and %rbx, %rbp - neg %rbp - mulq %rbx - add %rax, @acc[3] - adc %rdx, %rbp - mov @acc[3], 8*3($out_ptr) - - mov 8*5($in_ptr), @acc[0] # load |v| - mov 8*6($in_ptr), @acc[1] - mov 8*7($in_ptr), @acc[2] - mov 8*8($in_ptr), @acc[3] - mov 8*9($in_ptr), @acc[4] - mov 8*10($in_ptr), @acc[5] - mov 8*11($in_ptr), @acc[6] - mov 8*12($in_ptr), @acc[7] - - mov $g0, $f0 - sar \$63, $f0 # |g0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |g0|'s sign as bit - - xor $f0, $g0 # conditionally negate |g0| - add %rax, $g0 - - xor $f0, @acc[0] # conditionally negate |v| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, @acc[4] - xor $f0, @acc[5] - xor $f0, @acc[6] - xor $f0, @acc[7] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] - adc \$0, @acc[7] - - mulq $g0 - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<7; $i++) { -$code.=<<___; - mulq $g0 - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - imulq $g0 - add %rax, @acc[7] - adc \$0, %rdx # used in the final step - - mov %rbp, %rbx - sar \$63, %rbp # sign extension - - add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc %rbx, @acc[4] - adc %rbp, @acc[5] - adc %rbp, @acc[6] - adc %rbp, @acc[7] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - mov @acc[6], 
8*6($out_ptr) - mov @acc[7], 8*7($out_ptr) - - ret -.size __smulq_512x63,.-__smulq_512x63 - -.type __smulq_256x63,\@abi-omnipotent -.align 32 -__smulq_256x63: -___ -for($j=0; $j<2; $j++) { -my $k = 8*5*$j; -my @acc=@acc; @acc=@acc[4..7] if($j); -my $top="%rbp"; $top=$g0 if($j); -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - mov $k+8*4($in_ptr), $top # sign/excess limb - - mov $f0, %rbx - sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) - - xor $f0, %rbx # conditionally negate |f0| - add %rax, %rbx - - xor $f0, @acc[0] # conditionally negate |u| (or |v|) - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, $top - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, $top - - mulq %rbx - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<3; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - and %rbx, $top - neg $top - mulq %rbx - add %rax, @acc[3] - adc %rdx, $top -___ -$code.=<<___ if ($j==0); - mov $g0, $f0 -___ -} -$code.=<<___; - add @acc[4], @acc[0] # accumulate |u|*|f0| - adc @acc[5], @acc[1] - adc @acc[6], @acc[2] - adc @acc[7], @acc[3] - adc %rcx, %rbp - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov %rbp, 8*4($out_ptr) - - ret -.size __smulq_256x63,.-__smulq_256x63 -___ -######################################################################## -# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of -# the names refers to maximum bit-lengths of |a| and |b|. As already -# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always -# chosen so that "bad things" don't happen. For example, so that the -# sum of the products doesn't overflow, and that the final result is -# never wider than inputs... 
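In plain-integer terms, each of these n_shift subroutines performs one step of the reference algorithm: multiply |a| and |b| by the signed single-limb factors, shift the sum right by k, and fold a negative result back into the signs of the factors. A hedged sketch of that step (the function name is ours, not blst's; Python's >> is an arithmetic shift, matching the reference):

    def smul_n_shift(a, b, f, g, k=31):
        # One __smulq_256_n_shift_by_31-style step: r = (a*f + b*g) >> k,
        # returned as |r|; if r came out negative, the signs of f and g are
        # flipped so the caller's invariant a*f + b*g >= 0 is restored.
        r = (a * f + b * g) >> k
        if r < 0:
            r, f, g = -r, -f, -g
        return r, f, g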
-{ -$code.=<<___; -.type __smulq_256_n_shift_by_31,\@abi-omnipotent -.align 32 -__smulq_256_n_shift_by_31: - mov $f0, 8*0($out_ptr) # offload |f0| - mov $g0, 8*1($out_ptr) # offload |g0| - mov $f0, %rbp -___ -for($j=0; $j<2; $j++) { -my $k = 8*4*$j; -my @acc=@acc; @acc=@acc[4..7] if ($j); -my $f0="%rbp"; $f0=$g0 if ($j); -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - - mov $f0, %rbx - sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) - - xor $f0, %rbx # conditionally negate |f0| (or |g0|) - add %rax, %rbx - - xor $f0, @acc[0] # conditionally negate |a| (or |b|) - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - - mulq %rbx - mov %rax, @acc[0] - mov @acc[1], %rax - and %rbx, $f0 - neg $f0 - mov %rdx, @acc[1] -___ -for($i=1; $i<3; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - mulq %rbx - add %rax, @acc[3] - adc %rdx, $f0 -___ -} -$code.=<<___; - add @acc[4], @acc[0] - adc @acc[5], @acc[1] - adc @acc[6], @acc[2] - adc @acc[7], @acc[3] - adc $g0, %rbp - - mov 8*0($out_ptr), $f0 # restore original |f0| - mov 8*1($out_ptr), $g0 # restore original |g0| - - shrd \$31, @acc[1], @acc[0] - shrd \$31, @acc[2], @acc[1] - shrd \$31, @acc[3], @acc[2] - shrd \$31, %rbp, @acc[3] - - sar \$63, %rbp # sign as mask - xor %rax, %rax - sub %rbp, %rax # sign as bit - - xor %rbp, @acc[0] # conditionally negate the result - xor %rbp, @acc[1] - xor %rbp, @acc[2] - xor %rbp, @acc[3] - add %rax, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - - xor %rbp, $f0 # conditionally negate |f0| - xor %rbp, $g0 # conditionally negate |g0| - add %rax, $f0 - add %rax, $g0 - - ret -.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 -___ -} - -{ -my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); -my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); -my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); -my ($a_, $b_) = ($a_lo, $b_lo); -{ -my @a = ($a_lo, $t1, $a_hi); -my @b = ($b_lo, $t2, $b_hi); - -$code.=<<___; -.type __ab_approximation_31_256,\@abi-omnipotent -.align 32 -__ab_approximation_31_256: - mov 8*3($in_ptr), @a[2] # load |a| in reverse order - mov 8*7($in_ptr), @b[2] # load |b| in reverse order - mov 8*2($in_ptr), @a[1] - mov 8*6($in_ptr), @b[1] - mov 8*1($in_ptr), @a[0] - mov 8*5($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # check top-most limbs, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*0($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*4($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... 
- cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - - mov @a[2], $t0 - or @b[2], $t0 - bsr $t0, %rcx - lea 1(%rcx), %rcx - cmovz @a[0], @a[2] - cmovz @b[0], @b[2] - cmovz $t0, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[1], @a[2] # align second limb to the left - shldq %cl, @b[1], @b[2] - - mov \$0x7FFFFFFF, %eax - and %rax, @a[0] - and %rax, @b[0] - not %rax - and %rax, @a[2] - and %rax, @b[2] - or @a[2], @a[0] - or @b[2], @b[0] - - jmp __inner_loop_31_256 - - ret -.size __ab_approximation_31_256,.-__ab_approximation_31_256 -___ -} -$code.=<<___; -.type __inner_loop_31_256,\@abi-omnipotent -.align 32 # comment and punish Coffee Lake by up to 40% -__inner_loop_31_256: ################# by Thomas Pornin - mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 - mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 - mov \$0x7FFFFFFF7FFFFFFF, $bias - -.Loop_31_256: - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - mov $fg0, $t2 - mov $fg1, $t3 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb $fg1, $fg0 - cmovb $t2, $fg1 - - sub $b_, $a_ # |a_|-|b_| - sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| - add $bias, $fg0 - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $fg0 - cmovz $t3, $fg1 - - shr \$1, $a_ # |a_|>>=1 - add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 - sub $bias, $fg1 - sub \$1, $cnt - jnz .Loop_31_256 - - shr \$32, $bias - mov %ecx, %edx # $fg0, $f0 - mov ${fg1}d, ${f1}d - shr \$32, $g0 - shr \$32, $g1 - sub $bias, $f0 # remove the bias - sub $bias, $g0 - sub $bias, $f1 - sub $bias, $g1 - - ret -.size __inner_loop_31_256,.-__inner_loop_31_256 - -.type __inner_loop_62_256,\@abi-omnipotent -.align 32 -__inner_loop_62_256: - mov $cnt, %r15d - mov \$1, $f0 # |f0|=1 - xor $g0, $g0 # |g0|=0 - xor $f1, $f1 # |f1|=0 - mov $f0, $g1 # |g1|=1 - mov $f0, %r14 - -.Loop_62_256: - xor $t0, $t0 - test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| - mov $b_lo, $t1 - cmovnz $b_lo, $t0 - sub $a_lo, $t1 # |b_|-|a_| - mov $a_lo, $t2 - sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) - cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| - cmovc $t2, $b_lo # |b_| = |a_| - mov $f0, $t0 # exchange |f0| and |f1| - cmovc $f1, $f0 - cmovc $t0, $f1 - mov $g0, $t1 # exchange |g0| and |g1| - cmovc $g1, $g0 - cmovc $t1, $g1 - xor $t0, $t0 - xor $t1, $t1 - shr \$1, $a_lo - test %r14, $t2 # if |a_| was odd, then we'll be subtracting... - cmovnz $f1, $t0 - cmovnz $g1, $t1 - add $f1, $f1 # |f1|<<=1 - add $g1, $g1 # |g1|<<=1 - sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) - sub \$1, %r15d - jnz .Loop_62_256 - - ret -.size __inner_loop_62_256,.-__inner_loop_62_256 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl b/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl deleted file mode 100755 index 268bf9d2546..00000000000 --- a/crypto/blst_src/asm/ct_inverse_mod_384-armv8.pl +++ /dev/null @@ -1,610 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. Performance is >12x better [on -# Cortex cores] than modulus-specific FLT addition chain... 
-# -# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_inverse_mod_383(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 62 - w = 64 - mask = (1 << w) - 1 - - for i in range(0, 766 // k): - # __ab_approximation_62 - n = max(a.bit_length(), b.bit_length()) - if n < 128: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_62 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smul_383_n_shift_by_62 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smul_767x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 766 % k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 766 % k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - if v < 0: - v += mod << (768 - mod.bit_length()) # left aligned - - return v & (2**768 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); -my @acc=map("x$_",(3..14)); -my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); -my $cnt = $n_ptr; -my @t = map("x$_",(22..28,2)); -my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; - -$frame = 16+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_383 -.type ct_inverse_mod_383, %function -.align 5 -ct_inverse_mod_383: - paciasp - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #$frame - - ldp @t[0], @acc[1], [$in_ptr,#8*0] - ldp @acc[2], @acc[3], [$in_ptr,#8*2] - ldp @acc[4], @acc[5], [$in_ptr,#8*4] - - add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot - and $in_ptr, $in_ptr, #-512 // in the frame... 
- stp $out_ptr, $nx_ptr, [sp] - - ldp @acc[6], @acc[7], [$n_ptr,#8*0] - ldp @acc[8], @acc[9], [$n_ptr,#8*2] - ldp @acc[10], @acc[11], [$n_ptr,#8*4] - - stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a| - stp @acc[2], @acc[3], [$in_ptr,#8*2] - stp @acc[4], @acc[5], [$in_ptr,#8*4] - stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b| - stp @acc[8], @acc[9], [$in_ptr,#8*8] - stp @acc[10], @acc[11], [$in_ptr,#8*10] - - ////////////////////////////////////////// first iteration - mov $cnt, #62 - bl .Lab_approximation_62_loaded - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - str $f0,[$out_ptr,#8*12] // initialize |u| with |f0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to dst |b| - bl __smul_383_n_shift_by_62 - str $f0, [$out_ptr,#8*12] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #62 - bl __ab_approximation_62 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - ldr @acc[4], [$in_ptr,#8*12] // |u| - ldr @acc[5], [$in_ptr,#8*18] // |v| - mul @acc[0], $f_, @acc[4] // |u|*|f0| - smulh @acc[1], $f_, @acc[4] - mul @acc[2], $g_, @acc[5] // |v|*|g0| - smulh @acc[3], $g_, @acc[5] - adds @acc[0], @acc[0], @acc[2] - adc @acc[1], @acc[1], @acc[3] - stp @acc[0], @acc[1], [$out_ptr,#8*6] - asr @acc[2], @acc[1], #63 // sign extenstion - stp @acc[2], @acc[2], [$out_ptr,#8*8] - stp @acc[2], @acc[2], [$out_ptr,#8*10] - - mul @acc[0], $f0, @acc[4] // |u|*|f1| - smulh @acc[1], $f0, @acc[4] - mul @acc[2], $g0, @acc[5] // |v|*|g1| - smulh @acc[3], $g0, @acc[5] - adds @acc[0], @acc[0], @acc[2] - adc @acc[1], @acc[1], @acc[3] - stp @acc[0], @acc[1], [$out_ptr,#8*12] - asr @acc[2], @acc[1], #63 // sign extenstion - stp @acc[2], @acc[2], [$out_ptr,#8*14] - stp @acc[2], @acc[2], [$out_ptr,#8*16] -___ -for($i=2; $i<11; $i++) { -$code.=<<___; - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #62 - bl __ab_approximation_62 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov $f_, $f0 // corrected |f0| - mov $g_, $g0 // corrected |g0| - - mov $f0, $f1 // |f1| - mov $g0, $g1 // |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add $out_ptr, $out_ptr, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov $f_, $f0 // corrected |f1| - mov $g_, $g0 // corrected |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to destination |v| - bl __smul_383x63 -___ -$code.=<<___ if ($i>5); - bl __smul_767x63_tail -___ -$code.=<<___ if ($i==5); - asr @t[5], @t[5], #63 // sign extension - stp @t[5], @t[5], [$out_ptr,#8*6] - stp @t[5], @t[5], [$out_ptr,#8*8] - stp @t[5], @t[5], [$out_ptr,#8*10] -___ -} -$code.=<<___; - ////////////////////////////////////////// iteration before last - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load - ldp $b_lo, $b_hi, [$in_ptr,#8*6] - bl __inner_loop_62 - - eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v| - str $a_lo, [$out_ptr,#8*0] - str $b_lo, [$out_ptr,#8*6] - - mov $f_, $f0 // exact |f0| - mov $g_, $g0 // 
exact |g0| - mov $f0, $f1 - mov $g0, $g1 - add $out_ptr, $out_ptr, #8*12 // pointer to dst |u| - bl __smul_383x63 - - mov $f_, $f0 // exact |f1| - mov $g_, $g0 // exact |g1| - add $out_ptr, $out_ptr, #8*6 // pointer to dst |v| - bl __smul_383x63 - bl __smul_767x63_tail - - ////////////////////////////////////////// last iteration - eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| - mov $cnt, #22 // 766 % 62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldr $a_lo, [$in_ptr,#8*0] // just load - eor $a_hi, $a_hi, $a_hi - ldr $b_lo, [$in_ptr,#8*6] - eor $b_hi, $b_hi, $b_hi - bl __inner_loop_62 - - mov $f_, $f1 - mov $g_, $g1 - ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr - bl __smul_383x63 - bl __smul_767x63_tail - ldr x30, [x29,#8] - - asr @t[0], @acc[5], #63 // sign as mask - ldp @acc[6], @acc[7], [$f0,#8*0] - ldp @acc[8], @acc[9], [$f0,#8*2] - ldp @acc[10], @acc[11], [$f0,#8*4] - - and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally - and @acc[7], @acc[7], @t[0] - adds @acc[0], @acc[0], @acc[6] - and @acc[8], @acc[8], @t[0] - adcs @acc[1], @acc[1], @acc[7] - and @acc[9], @acc[9], @t[0] - adcs @acc[2], @acc[2], @acc[8] - and @acc[10], @acc[10], @t[0] - adcs @acc[3], @acc[3], @acc[9] - and @acc[11], @acc[11], @t[0] - stp @acc[0], @acc[1], [$out_ptr,#8*6] - adcs @acc[4], @acc[4], @acc[10] - stp @acc[2], @acc[3], [$out_ptr,#8*8] - adc @acc[5], @acc[5], @acc[11] - stp @acc[4], @acc[5], [$out_ptr,#8*10] - - add sp, sp, #$frame - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - autiasp - ret -.size ct_inverse_mod_383,.-ct_inverse_mod_383 - -//////////////////////////////////////////////////////////////////////// -// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
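The __smul_383x63 code below relies throughout on the sign-as-mask trick: the factor's sign is broadcast to an all-ones/all-zeros mask, every limb is XORed with it, and the +1 of two's-complement negation is carried through the addition chain so the actual multiplication runs on non-negative values. A limb-level sketch of that pattern under the assumption of little-endian 64-bit limbs (names are illustrative, not blst API):

    MASK64 = (1 << 64) - 1

    def cond_negate_limbs(limbs, f):
        # Model of the conditional-negation pattern in __smul_383x63:
        # turn the sign of f into a mask, XOR it into every limb (given
        # low limb first) and carry the +1 of two's complement upward,
        # returning |value| as limbs together with |f|.
        sign = MASK64 if f < 0 else 0   # sign "as mask"
        carry = 1 if f < 0 else 0       # the +1 of two's complement
        out = []
        for limb in limbs:
            limb = (limb ^ sign) + carry
            carry = limb >> 64
            out.append(limb & MASK64)
        return out, abs(f)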
-.type __smul_383x63, %function -.align 5 -__smul_383x63: -___ -for($j=0; $j<2; $j++) { -my $f_ = $f_; $f_ = $g_ if ($j); -my @acc = @acc; @acc = @acc[6..11] if ($j); -my $k = 8*12+8*6*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) - asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) - ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) - sub $f_, $f_, $f1 - eor @acc[1], @acc[1], $f1 - adds @acc[0], @acc[0], $f1, lsr#63 - eor @acc[2], @acc[2], $f1 - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], $f1 - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], $f1 - adcs @acc[3], @acc[3], xzr - umulh @t[0], @acc[0], $f_ - eor @acc[5], @acc[5], $f1 - umulh @t[1], @acc[1], $f_ - adcs @acc[4], @acc[4], xzr - umulh @t[2], @acc[2], $f_ - adcs @acc[5], @acc[5], xzr - umulh @t[3], @acc[3], $f_ -___ -$code.=<<___ if ($j); - adc $g1, xzr, xzr // used in __smul_767x63_tail -___ -$code.=<<___; - umulh @t[4], @acc[4], $f_ - mul @acc[0], @acc[0], $f_ - mul @acc[1], @acc[1], $f_ - mul @acc[2], @acc[2], $f_ - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], $f_ - adcs @acc[2], @acc[2], @t[1] - mul @acc[4], @acc[4], $f_ - adcs @acc[3], @acc[3], @t[2] - mul @t[5+$j],@acc[5], $f_ - adcs @acc[4], @acc[4], @t[3] - adcs @t[5+$j],@t[5+$j],@t[4] -___ -$code.=<<___ if ($j==0); - adc @t[7], xzr, xzr -___ -} -$code.=<<___; - adc @t[7], @t[7], xzr - - adds @acc[0], @acc[0], @acc[6] - adcs @acc[1], @acc[1], @acc[7] - adcs @acc[2], @acc[2], @acc[8] - adcs @acc[3], @acc[3], @acc[9] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @acc[4], @acc[4], @acc[10] - stp @acc[2], @acc[3], [$out_ptr,#8*2] - adcs @t[5], @t[5], @t[6] - stp @acc[4], @t[5], [$out_ptr,#8*4] - adc @t[6], @t[7], xzr // used in __smul_767x63_tail - - ret -.size __smul_383x63,.-__smul_383x63 - -.type __smul_767x63_tail, %function -.align 5 -__smul_767x63_tail: - smulh @t[5], @acc[5], $f_ - ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v| - umulh @acc[11],@acc[11], $g_ - ldp @acc[2], @acc[3], [$in_ptr,#8*26] - ldp @acc[4], @acc[5], [$in_ptr,#8*28] - - eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v| - eor @acc[1], @acc[1], $f1 - eor @acc[2], @acc[2], $f1 - adds @acc[0], @acc[0], $g1 - eor @acc[3], @acc[3], $f1 - adcs @acc[1], @acc[1], xzr - eor @acc[4], @acc[4], $f1 - adcs @acc[2], @acc[2], xzr - eor @acc[5], @acc[5], $f1 - adcs @acc[3], @acc[3], xzr - umulh @t[0], @acc[0], $g_ - adcs @acc[4], @acc[4], xzr - umulh @t[1], @acc[1], $g_ - adc @acc[5], @acc[5], xzr - - umulh @t[2], @acc[2], $g_ - add @acc[11], @acc[11], @t[6] - umulh @t[3], @acc[3], $g_ - asr @t[6], @t[5], #63 - umulh @t[4], @acc[4], $g_ - mul @acc[0], @acc[0], $g_ - mul @acc[1], @acc[1], $g_ - mul @acc[2], @acc[2], $g_ - adds @acc[0], @acc[0], @acc[11] - mul @acc[3], @acc[3], $g_ - adcs @acc[1], @acc[1], @t[0] - mul @acc[4], @acc[4], $g_ - adcs @acc[2], @acc[2], @t[1] - mul @acc[5], @acc[5], $g_ - adcs @acc[3], @acc[3], @t[2] - adcs @acc[4], @acc[4], @t[3] - adc @acc[5], @acc[5], @t[4] - - adds @acc[0], @acc[0], @t[5] - adcs @acc[1], @acc[1], @t[6] - adcs @acc[2], @acc[2], @t[6] - adcs @acc[3], @acc[3], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*6] - adcs @acc[4], @acc[4], @t[6] - stp @acc[2], @acc[3], [$out_ptr,#8*8] - adc @acc[5], @acc[5], @t[6] - stp @acc[4], @acc[5], [$out_ptr,#8*10] - - ret -.size __smul_767x63_tail,.-__smul_767x63_tail - -.type __smul_383_n_shift_by_62, 
%function -.align 5 -__smul_383_n_shift_by_62: -___ -for($j=0; $j<2; $j++) { -my $f0 = $f0; $f0 = $g0 if ($j); -my @acc = @acc; @acc = @acc[6..11] if ($j); -my $k = 8*6*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) - asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) - ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) - sub @t[7], @t[7], @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - umulh @t[0], @acc[0], @t[7] - adcs @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], @t[7] - eor @acc[5], @acc[5], @t[6] - umulh @t[2], @acc[2], @t[7] - adcs @acc[4], @acc[4], xzr - umulh @t[3], @acc[3], @t[7] - adc @acc[5], @acc[5], xzr - - umulh @t[4], @acc[4], @t[7] - smulh @t[5+$j], @acc[5], @t[7] - mul @acc[0], @acc[0], @t[7] - mul @acc[1], @acc[1], @t[7] - mul @acc[2], @acc[2], @t[7] - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], @t[7] - adcs @acc[2], @acc[2], @t[1] - mul @acc[4], @acc[4], @t[7] - adcs @acc[3], @acc[3], @t[2] - mul @acc[5], @acc[5], @t[7] - adcs @acc[4], @acc[4], @t[3] - adcs @acc[5], @acc[5] ,@t[4] - adc @t[5+$j], @t[5+$j], xzr -___ -} -$code.=<<___; - adds @acc[0], @acc[0], @acc[6] - adcs @acc[1], @acc[1], @acc[7] - adcs @acc[2], @acc[2], @acc[8] - adcs @acc[3], @acc[3], @acc[9] - adcs @acc[4], @acc[4], @acc[10] - adcs @acc[5], @acc[5], @acc[11] - adc @acc[6], @t[5], @t[6] - - extr @acc[0], @acc[1], @acc[0], #62 - extr @acc[1], @acc[2], @acc[1], #62 - extr @acc[2], @acc[3], @acc[2], #62 - asr @t[6], @acc[6], #63 - extr @acc[3], @acc[4], @acc[3], #62 - extr @acc[4], @acc[5], @acc[4], #62 - extr @acc[5], @acc[6], @acc[5], #62 - - eor @acc[0], @acc[0], @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - adcs @acc[3], @acc[3], xzr - eor @acc[5], @acc[5], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @acc[4], @acc[4], xzr - stp @acc[2], @acc[3], [$out_ptr,#8*2] - adc @acc[5], @acc[5], xzr - stp @acc[4], @acc[5], [$out_ptr,#8*4] - - eor $f0, $f0, @t[6] - eor $g0, $g0, @t[6] - sub $f0, $f0, @t[6] - sub $g0, $g0, @t[6] - - ret -.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 -___ - -{ -my @a = @acc[0..5]; -my @b = @acc[6..11]; - -$code.=<<___; -.type __ab_approximation_62, %function -.align 4 -__ab_approximation_62: - ldp @a[4], @a[5], [$in_ptr,#8*4] - ldp @b[4], @b[5], [$in_ptr,#8*10] - ldp @a[2], @a[3], [$in_ptr,#8*2] - ldp @b[2], @b[3], [$in_ptr,#8*8] - -.Lab_approximation_62_loaded: - orr @t[0], @a[5], @b[5] // check top-most limbs, ... - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[3], ne - orr @t[0], @a[5], @b[5] // ... ones before top-most, ... - csel @b[4], @b[4], @b[3], ne - - ldp @a[0], @a[1], [$in_ptr,#8*0] - ldp @b[0], @b[1], [$in_ptr,#8*6] - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[2], ne - orr @t[0], @a[5], @b[5] // ... and ones before that ... 
- csel @b[4], @b[4], @b[2], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[1], ne - orr @t[0], @a[5], @b[5] - csel @b[4], @b[4], @b[1], ne - - clz @t[0], @t[0] - cmp @t[0], #64 - csel @t[0], @t[0], xzr, ne - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - neg @t[1], @t[0] - - lslv @a[5], @a[5], @t[0] // align high limbs to the left - lslv @b[5], @b[5], @t[0] - lsrv @a[4], @a[4], @t[1] - lsrv @b[4], @b[4], @t[1] - and @a[4], @a[4], @t[1], asr#6 - and @b[4], @b[4], @t[1], asr#6 - orr @a[5], @a[5], @a[4] - orr @b[5], @b[5], @b[4] - - b __inner_loop_62 - ret -.size __ab_approximation_62,.-__ab_approximation_62 -___ -} -$code.=<<___; -.type __inner_loop_62, %function -.align 4 -__inner_loop_62: - mov $f0, #1 // |f0|=1 - mov $g0, #0 // |g0|=0 - mov $f1, #0 // |f1|=0 - mov $g1, #1 // |g1|=1 - -.Loop_62: - sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting - sub $cnt, $cnt, #1 - subs @t[2], $b_lo, $a_lo // |b_|-|a_| - and @t[0], $b_lo, @t[6] - sbc @t[3], $b_hi, $a_hi - and @t[1], $b_hi, @t[6] - subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov @t[0], $f0 - sbcs @t[5], $a_hi, @t[1] - mov @t[1], $g0 - csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| - csel $b_hi, $b_hi, $a_hi, hs - csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $a_hi, @t[5], @t[3], hs - csel $f0, $f0, $f1, hs // exchange |f0| and |f1| - csel $f1, $f1, @t[0], hs - csel $g0, $g0, $g1, hs // exchange |g0| and |g1| - csel $g1, $g1, @t[1], hs - extr $a_lo, $a_hi, $a_lo, #1 - lsr $a_hi, $a_hi, #1 - and @t[0], $f1, @t[6] - and @t[1], $g1, @t[6] - add $f1, $f1, $f1 // |f1|<<=1 - add $g1, $g1, $g1 // |g1|<<=1 - sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) - cbnz $cnt, .Loop_62 - - ret -.size __inner_loop_62,.-__inner_loop_62 -___ - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl b/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl deleted file mode 100755 index 4128dc3236d..00000000000 --- a/crypto/blst_src/asm/ct_is_square_mod_384-armv8.pl +++ /dev/null @@ -1,401 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast quadratic residue test as suggested in -# https://eprint.iacr.org/2020/972. Performance is >12x better [on -# Cortex cores] than modulus-specific Legendre symbol addition chain... 
-# -# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_is_square_mod_384(inp, mod): - a = inp - b = mod - L = 0 # only least significant bit, adding 1 makes up for sign change - - k = 30 - w = 32 - mask = (1 << w) - 1 - - for i in range(0, 768 // k - 1): - # __ab_approximation_30 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_30 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - # __smulq_384_n_shift_by_30 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if b < 0: - b = -b - if a < 0: - a = -a - L += (b % 4) >> 1 # |b| is always odd, the second bit - # tells the whole story - - if True: - for j in range(0, 768 % k + k): - if a & 1: - if a < b: - a, b = b, a - L += (a & b) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a = a-b - a = a >> 1 - L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - return (L & 1) ^ 1 -___ - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); -my @acc=map("x$_",(3..14)); -my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); -my @t = map("x$_",(21..28)); -my ($a_, $b_) = @acc[5,11]; - -$frame = 2*256; - -$code.=<<___; -.text - -.globl ct_is_square_mod_384 -.type ct_is_square_mod_384, %function -.align 5 -ct_is_square_mod_384: - paciasp - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #$frame - - ldp @acc[0], @acc[1], [x0,#8*0] // load input - ldp @acc[2], @acc[3], [x0,#8*2] - ldp @acc[4], @acc[5], [x0,#8*4] - - add $in_ptr, sp, #255 // find closest 256-byte-aligned spot - and $in_ptr, $in_ptr, #-256 // in the frame... 
- - ldp @acc[6], @acc[7], [x1,#8*0] // load modulus - ldp @acc[8], @acc[9], [x1,#8*2] - ldp @acc[10], @acc[11], [x1,#8*4] - - stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| - stp @acc[2], @acc[3], [$in_ptr,#8*8] - stp @acc[4], @acc[5], [$in_ptr,#8*10] - stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| - stp @acc[8], @acc[9], [$in_ptr,#8*2] - stp @acc[10], @acc[11], [$in_ptr,#8*4] - - eor $L, $L, $L // init the Legendre symbol - mov $cnt, #24 // 24 is 768/30-1 - b .Loop_is_square - -.align 4 -.Loop_is_square: - bl __ab_approximation_30 - sub $cnt, $cnt, #1 - - eor $out_ptr, $in_ptr, #128 // pointer to dst |b| - bl __smul_384_n_shift_by_30 - - mov $f1, $f0 // |f0| - mov $g1, $g0 // |g0| - add $out_ptr, $out_ptr, #8*6 // pointer to dst |a| - bl __smul_384_n_shift_by_30 - - ldp @acc[6], @acc[7], [$out_ptr,#-8*6] - eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| - and @t[6], @t[6], @acc[6] // if |a| was negative, - add $L, $L, @t[6], lsr#1 // adjust |L| - - cbnz $cnt, .Loop_is_square - - ////////////////////////////////////////// last iteration - //bl __ab_approximation_30 // |a| and |b| are exact, - //ldr $a_, [$in_ptr,#8*6] // and loaded - //ldr $b_, [$in_ptr,#8*0] - mov $cnt, #48 // 48 is 768%30 + 30 - bl __inner_loop_48 - ldr x30, [x29,#8] - - and x0, $L, #1 - eor x0, x0, #1 - - add sp, sp, #$frame - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - autiasp - ret -.size ct_is_square_mod_384,.-ct_is_square_mod_384 - -.type __smul_384_n_shift_by_30, %function -.align 5 -__smul_384_n_shift_by_30: -___ -for($j=0; $j<2; $j++) { -my $fx = $g1; $fx = $f1 if ($j); -my @acc = @acc; @acc = @acc[6..11] if ($j); -my $k = 8*6*$j; -$code.=<<___; - ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) - asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) - ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] - eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) - ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] - - eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) - sub $fx, $fx, @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - umulh @t[0], @acc[0], $fx - adcs @acc[3], @acc[3], xzr - umulh @t[1], @acc[1], $fx - eor @acc[5], @acc[5], @t[6] - umulh @t[2], @acc[2], $fx - adcs @acc[4], @acc[4], xzr - umulh @t[3], @acc[3], $fx - adc @acc[5], @acc[5], xzr - - umulh @t[4], @acc[4], $fx - and @t[7], $fx, @t[6] - umulh @t[5+$j], @acc[5], $fx - neg @t[7], @t[7] - mul @acc[0], @acc[0], $fx - mul @acc[1], @acc[1], $fx - mul @acc[2], @acc[2], $fx - adds @acc[1], @acc[1], @t[0] - mul @acc[3], @acc[3], $fx - adcs @acc[2], @acc[2], @t[1] - mul @acc[4], @acc[4], $fx - adcs @acc[3], @acc[3], @t[2] - mul @acc[5], @acc[5], $fx - adcs @acc[4], @acc[4], @t[3] - adcs @acc[5], @acc[5] ,@t[4] - adc @t[5+$j], @t[5+$j], @t[7] -___ -} -$code.=<<___; - adds @acc[0], @acc[0], @acc[6] - adcs @acc[1], @acc[1], @acc[7] - adcs @acc[2], @acc[2], @acc[8] - adcs @acc[3], @acc[3], @acc[9] - adcs @acc[4], @acc[4], @acc[10] - adcs @acc[5], @acc[5], @acc[11] - adc @acc[6], @t[5], @t[6] - - extr @acc[0], @acc[1], @acc[0], #30 - extr @acc[1], @acc[2], @acc[1], #30 - extr @acc[2], @acc[3], @acc[2], #30 - asr @t[6], @acc[6], #63 - extr @acc[3], @acc[4], @acc[3], #30 - extr @acc[4], @acc[5], @acc[4], #30 - extr @acc[5], @acc[6], @acc[5], #30 - - 
eor @acc[0], @acc[0], @t[6] - eor @acc[1], @acc[1], @t[6] - adds @acc[0], @acc[0], @t[6], lsr#63 - eor @acc[2], @acc[2], @t[6] - adcs @acc[1], @acc[1], xzr - eor @acc[3], @acc[3], @t[6] - adcs @acc[2], @acc[2], xzr - eor @acc[4], @acc[4], @t[6] - adcs @acc[3], @acc[3], xzr - eor @acc[5], @acc[5], @t[6] - stp @acc[0], @acc[1], [$out_ptr,#8*0] - adcs @acc[4], @acc[4], xzr - stp @acc[2], @acc[3], [$out_ptr,#8*2] - adc @acc[5], @acc[5], xzr - stp @acc[4], @acc[5], [$out_ptr,#8*4] - - ret -.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 -___ - -{ -my @a = @acc[0..5]; -my @b = @acc[6..11]; -my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); - -$code.=<<___; -.type __ab_approximation_30, %function -.align 4 -__ab_approximation_30: - ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers - ldp @b[2], @b[3], [$in_ptr,#8*2] - - orr @t[0], @a[5], @b[5] // check top-most limbs, ... - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[3], ne - orr @t[0], @a[5], @b[5] // ... ones before top-most, ... - csel @b[4], @b[4], @b[3], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[2], ne - orr @t[0], @a[5], @b[5] // ... and ones before that ... - csel @b[4], @b[4], @b[2], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[1], ne - orr @t[0], @a[5], @b[5] // and one more, ... - csel @b[4], @b[4], @b[1], ne - - cmp @t[0], #0 - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - csel @a[4], @a[4], @a[0], ne - orr @t[0], @a[5], @b[5] - csel @b[4], @b[4], @b[0], ne - - clz @t[0], @t[0] - cmp @t[0], #64 - csel @t[0], @t[0], xzr, ne - csel @a[5], @a[5], @a[4], ne - csel @b[5], @b[5], @b[4], ne - neg @t[1], @t[0] - - lslv @a[5], @a[5], @t[0] // align high limbs to the left - lslv @b[5], @b[5], @t[0] - lsrv @a[4], @a[4], @t[1] - lsrv @b[4], @b[4], @t[1] - and @a[4], @a[4], @t[1], asr#6 - and @b[4], @b[4], @t[1], asr#6 - orr $a_, @a[5], @a[4] - orr $b_, @b[5], @b[4] - - bfxil $a_, @a[0], #0, #32 - bfxil $b_, @b[0], #0, #32 - - b __inner_loop_30 - ret -.size __ab_approximation_30,.-__ab_approximation_30 - -.type __inner_loop_30, %function -.align 4 -__inner_loop_30: - mov $cnt, #30 - mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov $bias,#0x7FFFFFFF7FFFFFFF - -.Loop_30: - sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting - and @t[4], $a_, $b_ - sub $cnt, $cnt, #1 - and @t[0], $b_, @t[3] - - sub @t[1], $b_, $a_ // |b_|-|a_| - subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 - mov @t[0], $fg1 - csel $b_, $b_, $a_, hs // |b_| = |a_| - csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| - csel $fg0, $fg0, @t[0], hs - csel $L, $L, @t[4], hs - lsr $a_, $a_, #1 - and @t[0], $fg1, @t[3] - and @t[1], $bias, @t[3] - add $t[2], $b_, #2 - sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add $fg1, $fg1, $fg1 // |f1|<<=1 - add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - add $fg0, $fg0, @t[1] - sub $fg1, $fg1, $bias - - cbnz $cnt, .Loop_30 - - mov $bias, #0x7FFFFFFF - ubfx $f0, $fg0, #0, #32 - ubfx $g0, $fg0, #32, #32 - ubfx $f1, $fg1, #0, #32 - ubfx $g1, $fg1, #32, #32 - sub $f0, $f0, $bias // remove the bias - sub $g0, $g0, $bias - sub $f1, $f1, $bias - sub $g1, $g1, $bias - - ret -.size 
__inner_loop_30,.-__inner_loop_30 -___ -} - -{ -my ($a_, $b_) = (@acc[0], @acc[6]); -$code.=<<___; -.type __inner_loop_48, %function -.align 4 -__inner_loop_48: -.Loop_48: - sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting - and @t[4], $a_, $b_ - sub $cnt, $cnt, #1 - and @t[0], $b_, @t[3] - sub @t[1], $b_, $a_ // |b_|-|a_| - subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) - add @t[4], $L, @t[4], lsr#1 - csel $b_, $b_, $a_, hs // |b_| = |a_| - csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel $L, $L, @t[4], hs - add $t[2], $b_, #2 - lsr $a_, $a_, #1 - add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - - cbnz $cnt, .Loop_48 - - ret -.size __inner_loop_48,.-__inner_loop_48 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl b/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl deleted file mode 100755 index 40016ed70d2..00000000000 --- a/crypto/blst_src/asm/ct_is_square_mod_384-x86_64.pl +++ /dev/null @@ -1,494 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast quadratic residue test as suggested in -# https://eprint.iacr.org/2020/972. Performance is >5x better than -# modulus-specific Legendre symbol addition chain... -# -# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_is_square_mod_384(inp, mod): - a = inp - b = mod - L = 0 # only least significant bit, adding 1 makes up for sign change - - k = 30 - w = 32 - mask = (1 << w) - 1 - - for i in range(0, 768 // k - 1): - # __ab_approximation_30 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_30 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - # __smulq_384_n_shift_by_30 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if b < 0: - b = -b - if a < 0: - a = -a - L += (b % 4) >> 1 # |b| is always odd, the second bit - # tells the whole story - - if True: - for j in range(0, 768 % k + k): - if a & 1: - if a < b: - a, b = b, a - L += (a & b) >> 1 # |a| and |b| are both odd, second bits - # tell the whole story - a = a-b - a = a >> 1 - L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] - - return (L & 1) ^ 1 -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); -my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); -my @acc=map("%r$_",(8..15)); -my $L = "%rbp"; - -$frame = 8*3+2*256; - -$code.=<<___; -.text - -.globl ct_is_square_mod_384 -.type ct_is_square_mod_384,\@function,2,"unwind" -.align 32 
-ct_is_square_mod_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot - and \$-256, %rax # in the frame... - - mov 8*0(%rdi), @acc[0] # load input - mov 8*1(%rdi), @acc[1] - mov 8*2(%rdi), @acc[2] - mov 8*3(%rdi), @acc[3] - mov 8*4(%rdi), @acc[4] - mov 8*5(%rdi), @acc[5] - - mov 8*0(%rsi), @acc[6] # load modulus - mov 8*1(%rsi), @acc[7] - mov 8*2(%rsi), %rbx - mov 8*3(%rsi), %rcx - mov 8*4(%rsi), %rdx - mov 8*5(%rsi), %rdi - mov %rax, $in_ptr # pointer to source |a|b| - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - mov @acc[4], 8*4(%rax) - mov @acc[5], 8*5(%rax) - - mov @acc[6], 8*6(%rax) # copy modulus to |b| - mov @acc[7], 8*7(%rax) - mov %rbx, 8*8(%rax) - mov %rcx, 8*9(%rax) - mov %rdx, 8*10(%rax) - mov %rdi, 8*11(%rax) - - xor $L, $L # initialize the Legendre symbol - mov \$24, %ecx # 24 is 768/30-1 - jmp .Loop_is_square - -.align 32 -.Loop_is_square: - mov %ecx, 8*2(%rsp) # offload loop counter - - call __ab_approximation_30 - mov $f0, 8*0(%rsp) # offload |f0| and |g0| - mov $g0, 8*1(%rsp) - - mov \$128+8*6, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |b| - call __smulq_384_n_shift_by_30 - - mov 8*0(%rsp), $f1 # pop |f0| and |g0| - mov 8*1(%rsp), $g1 - lea -8*6($out_ptr),$out_ptr # pointer to destination |a| - call __smulq_384_n_shift_by_30 - - mov 8*2(%rsp), %ecx # re-load loop counter - xor \$128, $in_ptr # flip-flop pointer to source |a|b| - - and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| - shr \$1, @acc[6] - add @acc[6], $L - - sub \$1, %ecx - jnz .Loop_is_square - - ################################# last iteration - #call __ab_approximation_30 # |a| and |b| are exact, just load - #mov 8*0($in_ptr), @acc[0] # |a_| - mov 8*6($in_ptr), @acc[1] # |b_| - call __inner_loop_48 # 48 is 768%30+30 - - mov \$1, %rax - and $L, %rax - xor \$1, %rax # return value - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ct_is_square_mod_384,.-ct_is_square_mod_384 - -.type __smulq_384_n_shift_by_30,\@abi-omnipotent -.align 32 -__smulq_384_n_shift_by_30: -___ -for($j=0; $j<2; $j++) { -$code.=<<___; - mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov %rdx, %rbx # |f1| (or |g1|) - sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) - xor %rax, %rax - sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) - - xor %rdx, %rbx # conditionally negate |f1| (or |g1|) - add %rax, %rbx - - xor %rdx, @acc[0] # conditionally negate |a| (or |b|) - xor %rdx, @acc[1] - xor %rdx, @acc[2] - xor %rdx, @acc[3] - xor %rdx, @acc[4] - xor %rdx, @acc[5] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov %rdx, @acc[6+$j] - and %rbx, @acc[6+$j] - mulq %rbx # |a|*|f1| (or |b|*|g1|) - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, 
@acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq %rbx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___; - neg @acc[6+$j] - mulq %rbx - add %rax, @acc[5] - adc %rdx, @acc[6+$j] -___ -$code.=<<___ if ($j==0); - lea 8*6($in_ptr), $in_ptr # pointer to |b| - mov $g1, %rdx - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - lea -8*6($in_ptr), $in_ptr # restore original in_ptr - - add 8*0($out_ptr), @acc[0] - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - adc @acc[7], @acc[6] - - shrd \$30, @acc[1], @acc[0] - shrd \$30, @acc[2], @acc[1] - shrd \$30, @acc[3], @acc[2] - shrd \$30, @acc[4], @acc[3] - shrd \$30, @acc[5], @acc[4] - shrd \$30, @acc[6], @acc[5] - - sar \$63, @acc[6] # sign as mask - xor %rbx, %rbx - sub @acc[6], %rbx # sign as bit - - xor @acc[6], @acc[0] # conditionally negate the result - xor @acc[6], @acc[1] - xor @acc[6], @acc[2] - xor @acc[6], @acc[3] - xor @acc[6], @acc[4] - xor @acc[6], @acc[5] - add %rbx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - ret -.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 -___ -{ -my ($a_, $b_) = @acc[0..1]; -my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); -my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); -my $cnt = "%edi"; -{ -my @a = @acc[0..5]; -my @b = (@a[1..3], $t4, $t5, $g0); - -$code.=<<___; -.type __ab_approximation_30,\@abi-omnipotent -.align 32 -__ab_approximation_30: - mov 8*11($in_ptr), @b[5] # load |b| in reverse order - mov 8*10($in_ptr), @b[4] - mov 8*9($in_ptr), @b[3] - - mov @a[5], %rax - or @b[5], %rax # check top-most limbs, ... - cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[3], @a[4] - mov 8*8($in_ptr), @b[2] - cmovz @b[3], @b[4] - - mov @a[5], %rax - or @b[5], %rax # ... ones before top-most, ... - cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[2], @a[4] - mov 8*7($in_ptr), @b[1] - cmovz @b[2], @b[4] - - mov @a[5], %rax - or @b[5], %rax # ... and ones before that ... - cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[1], @a[4] - mov 8*6($in_ptr), @b[0] - cmovz @b[1], @b[4] - - mov @a[5], %rax - or @b[5], %rax # ... and ones before that ... 
- cmovz @a[4], @a[5] - cmovz @b[4], @b[5] - cmovz @a[0], @a[4] - cmovz @b[0], @b[4] - - mov @a[5], %rax - or @b[5], %rax - bsr %rax, %rcx - lea 1(%rcx), %rcx - cmovz @a[0], @a[5] - cmovz @b[0], @b[5] - cmovz %rax, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[4], @a[5] # align second limb to the left - shldq %cl, @b[4], @b[5] - - mov \$0xFFFFFFFF00000000, %rax - mov @a[0]d, ${a_}d - mov @b[0]d, ${b_}d - and %rax, @a[5] - and %rax, @b[5] - or @a[5], ${a_} - or @b[5], ${b_} - - jmp __inner_loop_30 - - ret -.size __ab_approximation_30,.-__ab_approximation_30 -___ -} -$code.=<<___; -.type __inner_loop_30,\@abi-omnipotent -.align 32 -__inner_loop_30: ################# by Thomas Pornin - mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 - mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 - lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF - mov \$30, $cnt - -.Loop_30: - mov $a_, %rax - and $b_, %rax - shr \$1, %rax # (a_ & b_) >> 1 - - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - lea (%rax,$L), %rax # pre-"negate" |L| - mov $fg0, $t2 - mov $fg1, $t3 - mov $L, $t4 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb $fg1, $fg0 - cmovb $t2, $fg1 - cmovb %rax, $L - - sub $b_, $a_ # |a_|-|b_| - sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| - add $bias, $fg0 - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $fg0 - cmovz $t3, $fg1 - cmovz $t4, $L - - lea 2($b_), %rax - shr \$1, $a_ # |a_|>>=1 - shr \$2, %rax - add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 - lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 - sub $bias, $fg1 - - sub \$1, $cnt - jnz .Loop_30 - - shr \$32, $bias - mov %ebx, %eax # $fg0 -> $f0 - shr \$32, $g0 - mov %ecx, %edx # $fg1 -> $f1 - shr \$32, $g1 - sub $bias, $f0 # remove the bias - sub $bias, $g0 - sub $bias, $f1 - sub $bias, $g1 - - ret -.size __inner_loop_30,.-__inner_loop_30 - -.type __inner_loop_48,\@abi-omnipotent -.align 32 -__inner_loop_48: - mov \$48, $cnt # 48 is 768%30+30 - -.Loop_48: - mov $a_, %rax - and $b_, %rax - shr \$1, %rax # (a_ & b_) >> 1 - - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - lea (%rax,$L), %rax - mov $L, $t2 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb %rax, $L - - sub $b_, $a_ # |a_|-|b_| - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $L - - lea 2($b_), %rax - shr \$1, $a_ # |a_|>>=1 - shr \$2, %rax - add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 - - sub \$1, $cnt - jnz .Loop_48 - - ret -.size __inner_loop_48,.-__inner_loop_48 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl deleted file mode 100755 index 2be39d8ba8b..00000000000 --- a/crypto/blst_src/asm/ctq_inverse_mod_384-x86_64.pl +++ /dev/null @@ -1,886 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. Performance is >5x better than -# modulus-specific FLT addition chain... 
-# -# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_inverse_mod_383(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 62 - w = 64 - mask = (1 << w) - 1 - - for i in range(0, 766 // k): - # __ab_approximation_62 - n = max(a.bit_length(), b.bit_length()) - if n < 128: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-w)) << w) - b_ = (b & mask) | ((b >> (n-w)) << w) - - # __inner_loop_62 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smulq_383_n_shift_by_62 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smulq_767x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 766 % k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 766 % k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - if v < 0: - v += mod << (768 - mod.bit_length()) # left aligned - - return v & (2**768 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); -my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); -my $cnt = "%edi"; - -$frame = 8*11+2*512; - -$code.=<<___; -.text - -.globl ct_inverse_mod_383 -.type ct_inverse_mod_383,\@function,4,"unwind" -.align 32 -ct_inverse_mod_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot - and \$-512, %rax # in the frame... 
- mov $out_ptr, 8*4(%rsp) - mov $nx_ptr, 8*5(%rsp) - - mov 8*0($in_ptr), @acc[0] # load input - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov 8*0($n_ptr), @acc[6] # load modulus - mov 8*1($n_ptr), @acc[7] - mov 8*2($n_ptr), @acc[8] - mov 8*3($n_ptr), @acc[9] - mov 8*4($n_ptr), @acc[10] - mov 8*5($n_ptr), @acc[11] - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - mov @acc[4], 8*4(%rax) - mov @acc[5], 8*5(%rax) - - mov @acc[6], 8*6(%rax) # copy modulus to |b| - mov @acc[7], 8*7(%rax) - mov @acc[8], 8*8(%rax) - mov @acc[9], 8*9(%rax) - mov @acc[10], 8*10(%rax) - mov %rax, $in_ptr # pointer to source |a|b|1|0| - mov @acc[11], 8*11(%rax) - - ################################# first iteration - mov \$62, $cnt - call __ab_approximation_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_383_n_shift_by_62 - #mov $f0, 8*7(%rsp) # corrected |f0| - #mov $g0, 8*8(%rsp) # corrected |g0| - mov $f0, 8*12($out_ptr) # initialize |u| with |f0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_383_n_shift_by_62 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - mov $f0, 8*12($out_ptr) # initialize |v| with |f1| - - ################################# second iteration - xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$62, $cnt - call __ab_approximation_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_383_n_shift_by_62 - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_383_n_shift_by_62 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*12($in_ptr), %rax # |u| - mov 8*18($in_ptr), @acc[3] # |v| - mov $f0, %rbx - mov %rax, @acc[2] - imulq 8*7(%rsp) # |u|*|f0| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq 8*8(%rsp) # |v|*|g0| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*6($out_ptr) # destination |u| - mov @acc[1], 8*7($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*8($out_ptr) - mov @acc[1], 8*9($out_ptr) - mov @acc[1], 8*10($out_ptr) - mov @acc[1], 8*11($out_ptr) - lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor - - mov @acc[2], %rax - imulq %rbx # |u|*|f1| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq %rcx # |v|*|g1| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*12($out_ptr) # destination |v| - mov @acc[1], 8*13($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*14($out_ptr) - mov @acc[1], 8*15($out_ptr) - mov @acc[1], 8*16($out_ptr) - mov @acc[1], 8*17($out_ptr) -___ -for($i=2; $i<11; $i++) { -my $smul_767x63 = $i>5 ? 
"__smulq_767x63" - : "__smulq_383x63"; -$code.=<<___; - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$62, $cnt - call __ab_approximation_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulq_383_n_shift_by_62 - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulq_383_n_shift_by_62 - mov $f0, 8*9(%rsp) # corrected |f1| - mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*7(%rsp), $f0 # |f0| - mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - call __smulq_383x63 - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr),$out_ptr # pointer to destination |v| - call $smul_767x63 -___ -$code.=<<___ if ($i==5); - sar \$63, @acc[5] # sign extension - mov @acc[5], 8*6($out_ptr) - mov @acc[5], 8*7($out_ptr) - mov @acc[5], 8*8($out_ptr) - mov @acc[5], 8*9($out_ptr) - mov @acc[5], 8*10($out_ptr) - mov @acc[5], 8*11($out_ptr) -___ -} -$code.=<<___; - ################################# iteration before last - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$62, $cnt - #call __ab_approximation_62 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - mov 8*1($in_ptr), @acc[1] # |a_hi| - mov 8*6($in_ptr), @acc[2] # |b_lo| - mov 8*7($in_ptr), @acc[3] # |b_hi| - call __inner_loop_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - mov @acc[0], 8*0($out_ptr) - mov @acc[2], 8*6($out_ptr) - - #mov 8*7(%rsp), $f0 # |f0| - #mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*12($out_ptr),$out_ptr # pointer to destination |u| - call __smulq_383x63 - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr),$out_ptr # pointer to destination |v| - call __smulq_767x63 - - ################################# last iteration - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$22, $cnt # 766 % 62 - #call __ab_approximation_62 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - xor @acc[1], @acc[1] # |a_hi| - mov 8*6($in_ptr), @acc[2] # |b_lo| - xor @acc[3], @acc[3] # |b_hi| - call __inner_loop_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - #mov $f1, 8*9(%rsp) - #mov $g1, 8*10(%rsp) - - #mov 8*7(%rsp), $f0 # |f0| - #mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - #call __smulq_383x63 - - #mov 8*9(%rsp), $f0 # |f1| - #mov 8*10(%rsp), $g0 # |g1| - mov $f1, $f0 - mov $g1, $g0 - mov 8*4(%rsp), $out_ptr # original out_ptr - call __smulq_767x63 - - mov 8*5(%rsp), $in_ptr # original n_ptr - mov %rax, %rdx # top limb of the result - sar \$63, %rax # result's sign as mask - - mov %rax, @acc[0] # mask |modulus| - mov %rax, @acc[1] - mov %rax, @acc[2] - and 8*0($in_ptr), @acc[0] - and 8*1($in_ptr), @acc[1] - mov %rax, @acc[3] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), @acc[3] - mov %rax, @acc[4] - and 8*4($in_ptr), @acc[4] - and 8*5($in_ptr), %rax - - add @acc[0], @acc[6] # conditionally add |modulus|<<384 - adc @acc[1], @acc[7] - adc @acc[2], @acc[8] - adc 
@acc[3], @acc[9] - adc @acc[4], %rcx - adc %rax, %rdx - - mov @acc[6], 8*6($out_ptr) # store absolute value - mov @acc[7], 8*7($out_ptr) - mov @acc[8], 8*8($out_ptr) - mov @acc[9], 8*9($out_ptr) - mov %rcx, 8*10($out_ptr) - mov %rdx, 8*11($out_ptr) - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ct_inverse_mod_383,.-ct_inverse_mod_383 -___ -######################################################################## -# see corresponding commentary in ctx_inverse_mod_384-x86_64... -{ -my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); -my $fx = @acc[9]; - -$code.=<<___; -.type __smulq_767x63,\@abi-omnipotent -.align 32 -__smulq_767x63: - mov 8*0($in_ptr), @acc[0] # load |u| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov $f0, $fx - sar \$63, $f0 # |f0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |f0|'s sign as bit - - mov $out_ptr, 8*1(%rsp) - mov $in_ptr, 8*2(%rsp) - lea 8*6($in_ptr), $in_ptr # pointer to |v| - - xor $f0, $fx # conditionally negate |f0| - add %rax, $fx - - xor $f0, @acc[0] # conditionally negate |u| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, @acc[4] - xor $f0, @acc[5] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulq $fx # |u|*|f0| - mov %rax, 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] - mov @acc[$i], 8*$i($out_ptr) -___ -} -$code.=<<___; - imulq $fx - add %rax, @acc[$i] - adc \$0, %rdx - - mov @acc[5], 8*5($out_ptr) - mov %rdx, 8*6($out_ptr) - sar \$63, %rdx # sign extension - mov %rdx, 8*7($out_ptr) -___ -{ -my $fx=$in_ptr; -$code.=<<___; - mov $g0, $f0 # load |g0| - - mov 8*0($in_ptr), @acc[0] # load |v| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - mov 8*6($in_ptr), @acc[6] - mov 8*7($in_ptr), @acc[7] - mov 8*8($in_ptr), @acc[8] - mov 8*9($in_ptr), @acc[9] - mov 8*10($in_ptr), @acc[10] - mov 8*11($in_ptr), @acc[11] - - mov $f0, $fx # overrides in_ptr - sar \$63, $f0 # |g0|'s sign as mask - xor %rax, %rax - sub $f0, %rax # |g0|'s sign as bit - - xor $f0, $fx # conditionally negate |g0| - add %rax, $fx - - xor $f0, @acc[0] # conditionally negate |v| - xor $f0, @acc[1] - xor $f0, @acc[2] - xor $f0, @acc[3] - xor $f0, @acc[4] - xor $f0, @acc[5] - xor $f0, @acc[6] - xor $f0, @acc[7] - xor $f0, @acc[8] - xor $f0, @acc[9] - xor $f0, @acc[10] - xor $f0, @acc[11] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] - adc \$0, @acc[7] - adc \$0, @acc[8] - adc \$0, @acc[9] - adc \$0, @acc[10] - adc \$0, @acc[11] - - mulq $fx # |v|*|g0| - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<11; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} 
-$code.=<<___; - mov 8*1(%rsp), %rdx # out_ptr - imulq $fx, %rax - mov 8*2(%rsp), $in_ptr # restore original in_ptr - add @acc[11], %rax - - add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| - adc 8*1(%rdx), @acc[1] - adc 8*2(%rdx), @acc[2] - adc 8*3(%rdx), @acc[3] - adc 8*4(%rdx), @acc[4] - adc 8*5(%rdx), @acc[5] - adc 8*6(%rdx), @acc[6] - mov 8*7(%rdx), @acc[11] # sign extension - adc @acc[11], @acc[7] - adc @acc[11], @acc[8] - adc @acc[11], @acc[9] - adc @acc[11], @acc[10] - adc @acc[11], %rax - - mov %rdx, $out_ptr # restore original out_ptr - - mov @acc[0], 8*0(%rdx) - mov @acc[1], 8*1(%rdx) - mov @acc[2], 8*2(%rdx) - mov @acc[3], 8*3(%rdx) - mov @acc[4], 8*4(%rdx) - mov @acc[5], 8*5(%rdx) - mov @acc[6], 8*6(%rdx) - mov @acc[7], 8*7(%rdx) - mov @acc[8], 8*8(%rdx) - mov @acc[9], 8*9(%rdx) - mov @acc[10], 8*10(%rdx) - mov %rax, 8*11(%rdx) - - ret -.size __smulq_767x63,.-__smulq_767x63 -___ -} -$code.=<<___; -.type __smulq_383x63,\@abi-omnipotent -.align 32 -__smulq_383x63: -___ -for($j=0; $j<2; $j++) { -$code.=<<___; - mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov %rdx, $fx - sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) - - xor %rdx, $fx # conditionally negate |f0| - add %rax, $fx - - xor %rdx, @acc[0] # conditionally negate |u| (or |v|) - xor %rdx, @acc[1] - xor %rdx, @acc[2] - xor %rdx, @acc[3] - xor %rdx, @acc[4] - xor %rdx, @acc[5] - add @acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulq $fx # |u|*|f0| (or |v|*|g0|) - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___ if ($j==0); - imulq $fx, %rax - add %rax, @acc[$i] - - lea 8*6($in_ptr), $in_ptr # pointer to |v| - mov $g0, %rdx - - mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - imulq $fx, %rax - add %rax, @acc[$i] - - lea -8*6($in_ptr), $in_ptr # restore original in_ptr - - add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - ret -.size __smulq_383x63,.-__smulq_383x63 -___ -{ -$code.=<<___; -.type __smulq_383_n_shift_by_62,\@abi-omnipotent -.align 32 -__smulq_383_n_shift_by_62: - mov $f0, @acc[8] -___ -my $f0 = @acc[8]; -for($j=0; $j<2; $j++) { -$code.=<<___; - mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov %rdx, $fx - sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) - - xor %rdx, $fx # conditionally negate |f0| (or |g0|) - add %rax, $fx - - xor %rdx, @acc[0] # conditionally negate |a| (or |b|) - xor %rdx, @acc[1] - xor %rdx, @acc[2] - xor %rdx, @acc[3] - xor %rdx, @acc[4] - xor %rdx, @acc[5] - add 
@acc[0], %rax - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulq $fx # |a|*|f0| (or |b|*|g0|) - mov %rax, @acc[0] - mov @acc[1], %rax - mov %rdx, @acc[1] -___ -for($i=1; $i<5; $i++) { -$code.=<<___; - mulq $fx - add %rax, @acc[$i] - mov @acc[$i+1], %rax - adc \$0, %rdx - mov %rdx, @acc[$i+1] -___ -} -$code.=<<___ if ($j==0); - imulq $fx - add %rax, @acc[$i] - adc \$0, %rdx - - lea 8*6($in_ptr), $in_ptr # pointer to |b| - mov %rdx, @acc[6] - mov $g0, %rdx - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - imulq $fx - add %rax, @acc[$i] - adc \$0, %rdx - - lea -8*6($in_ptr), $in_ptr # restore original in_ptr - - add 8*0($out_ptr), @acc[0] - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - adc %rdx, @acc[6] - mov $f0, %rdx - - shrd \$62, @acc[1], @acc[0] - shrd \$62, @acc[2], @acc[1] - shrd \$62, @acc[3], @acc[2] - shrd \$62, @acc[4], @acc[3] - shrd \$62, @acc[5], @acc[4] - shrd \$62, @acc[6], @acc[5] - - sar \$63, @acc[6] # sign as mask - xor $fx, $fx - sub @acc[6], $fx # sign as bit - - xor @acc[6], @acc[0] # conditionally negate the result - xor @acc[6], @acc[1] - xor @acc[6], @acc[2] - xor @acc[6], @acc[3] - xor @acc[6], @acc[4] - xor @acc[6], @acc[5] - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - xor @acc[6], %rdx # conditionally negate |f0| - xor @acc[6], $g0 # conditionally negate |g0| - add $fx, %rdx - add $fx, $g0 - - ret -.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 -___ -} } - -{ -my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); -my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); -{ -my @a = ($a_lo, $t1, $a_hi); -my @b = ($b_lo, $t2, $b_hi); - -$code.=<<___; -.type __ab_approximation_62,\@abi-omnipotent -.align 32 -__ab_approximation_62: - mov 8*5($in_ptr), @a[2] # load |a| in reverse order - mov 8*11($in_ptr), @b[2] # load |b| in reverse order - mov 8*4($in_ptr), @a[1] - mov 8*10($in_ptr), @b[1] - mov 8*3($in_ptr), @a[0] - mov 8*9($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # check top-most limbs, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - mov 8*2($in_ptr), @a[0] - mov 8*8($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... ones before top-most, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - mov 8*1($in_ptr), @a[0] - mov 8*7($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... 
- cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - mov 8*0($in_ptr), @a[0] - mov 8*6($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 - bsr $t0, %rcx - lea 1(%rcx), %rcx - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz $t0, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[1], @a[2] # align second limb to the left - shldq %cl, @b[1], @b[2] - - jmp __inner_loop_62 - - ret -.size __ab_approximation_62,.-__ab_approximation_62 -___ -} -$code.=<<___; -.type __inner_loop_62,\@abi-omnipotent -.align 8 -.long 0 -__inner_loop_62: - mov \$1, $f0 # |f0|=1 - xor $g0, $g0 # |g0|=0 - xor $f1, $f1 # |f1|=0 - mov \$1, $g1 # |g1|=1 - mov $in_ptr, 8(%rsp) - -.Loop_62: - xor $t0, $t0 - xor $t1, $t1 - test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| - mov $b_lo, $t2 - mov $b_hi, $t3 - cmovnz $b_lo, $t0 - cmovnz $b_hi, $t1 - sub $a_lo, $t2 # |b_|-|a_| - sbb $a_hi, $t3 - mov $a_lo, $t4 - mov $a_hi, $t5 - sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) - sbb $t1, $a_hi - cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| - cmovc $t3, $a_hi - cmovc $t4, $b_lo # |b_| = |a_| - cmovc $t5, $b_hi - mov $f0, $t0 # exchange |f0| and |f1| - cmovc $f1, $f0 - cmovc $t0, $f1 - mov $g0, $t1 # exchange |g0| and |g1| - cmovc $g1, $g0 - cmovc $t1, $g1 - xor $t0, $t0 - xor $t1, $t1 - shrd \$1, $a_hi, $a_lo - shr \$1, $a_hi - test \$1, $t4 # if |a_| was odd, then we'll be subtracting... - cmovnz $f1, $t0 - cmovnz $g1, $t1 - add $f1, $f1 # |f1|<<=1 - add $g1, $g1 # |g1|<<=1 - sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) - sub \$1, $cnt - jnz .Loop_62 - - mov 8(%rsp), $in_ptr - ret -.size __inner_loop_62,.-__inner_loop_62 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl b/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl deleted file mode 100755 index d207e2f5a7c..00000000000 --- a/crypto/blst_src/asm/ctx_inverse_mod_384-x86_64.pl +++ /dev/null @@ -1,995 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Both constant-time and fast Euclidean inversion as suggested in -# https://eprint.iacr.org/2020/972. Performance is >4x better than -# modulus-specific FLT addition chain... 
-# -# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); -# -$python_ref.=<<'___'; -def ct_inverse_mod_383(inp, mod): - a, u = inp, 1 - b, v = mod, 0 - - k = 31 - mask = (1 << k) - 1 - - for i in range(0, 766 // k): - # __ab_approximation_31 - n = max(a.bit_length(), b.bit_length()) - if n < 64: - a_, b_ = a, b - else: - a_ = (a & mask) | ((a >> (n-k-2)) << k) - b_ = (b & mask) | ((b >> (n-k-2)) << k) - - # __inner_loop_31 - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, k): - if a_ & 1: - if a_ < b_: - a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 - a_, f0, g0 = a_-b_, f0-f1, g0-g1 - a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 - - # __smulx_383_n_shift_by_31 - a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k - if a < 0: - a, f0, g0 = -a, -f0, -g0 - if b < 0: - b, f1, g1 = -b, -f1, -g1 - - # __smulx_767x63 - u, v = u*f0 + v*g0, u*f1 + v*g1 - - if 766 % k: - f0, g0, f1, g1 = 1, 0, 0, 1 - for j in range(0, 766 % k): - if a & 1: - if a < b: - a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 - a, f0, g0 = a-b, f0-f1, g0-g1 - a, f1, g1 = a >> 1, f1 << 1, g1 << 1 - - v = u*f1 + v*g1 - - if v < 0: - v += mod << (768 - mod.bit_length()) # left aligned - - return v & (2**768 - 1) # to be reduced % mod -___ - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); -my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); -my $cnt = "%edi"; - -$frame = 8*11+2*512; - -$code.=<<___; -.text - -.globl ctx_inverse_mod_383 -.type ctx_inverse_mod_383,\@function,4,"unwind" -.align 32 -ctx_inverse_mod_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot - and \$-512, %rax # in the frame... 
- mov $out_ptr, 8*4(%rsp) - mov $nx_ptr, 8*5(%rsp) - - mov 8*0($in_ptr), @acc[0] # load input - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov 8*0($n_ptr), @acc[6] # load modulus - mov 8*1($n_ptr), @acc[7] - mov 8*2($n_ptr), @acc[8] - mov 8*3($n_ptr), @acc[9] - mov 8*4($n_ptr), @acc[10] - mov 8*5($n_ptr), @acc[11] - - mov @acc[0], 8*0(%rax) # copy input to |a| - mov @acc[1], 8*1(%rax) - mov @acc[2], 8*2(%rax) - mov @acc[3], 8*3(%rax) - mov @acc[4], 8*4(%rax) - mov @acc[5], 8*5(%rax) - - mov @acc[6], 8*6(%rax) # copy modulus to |b| - mov @acc[7], 8*7(%rax) - mov @acc[8], 8*8(%rax) - mov @acc[9], 8*9(%rax) - mov @acc[10], 8*10(%rax) - mov %rax, $in_ptr - mov @acc[11], 8*11(%rax) - - ################################# first iteration - mov \$31, $cnt - call __ab_approximation_31 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulx_383_n_shift_by_31 - #mov $f0, 8*7(%rsp) # corrected |f0| - #mov $g0, 8*8(%rsp) # corrected |g0| - mov $f0, 8*12($out_ptr) # initialize |u| with |f0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulx_383_n_shift_by_31 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - mov $f0, 8*12($out_ptr) # initialize |v| with |f1| - - ################################# second iteration - xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call __smulx_383_n_shift_by_31 - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call __smulx_383_n_shift_by_31 - #mov $f0, 8*9(%rsp) # corrected |f1| - #mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*12($in_ptr), %rax # |u| - mov 8*18($in_ptr), @acc[3] # |v| - mov $f0, %rbx - mov %rax, @acc[2] - imulq 8*7(%rsp) # |u|*|f0| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq 8*8(%rsp) # |v|*|g0| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*6($out_ptr) # destination |u| - mov @acc[1], 8*7($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*8($out_ptr) - mov @acc[1], 8*9($out_ptr) - mov @acc[1], 8*10($out_ptr) - mov @acc[1], 8*11($out_ptr) - lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor - - mov @acc[2], %rax - imulq %rbx # |u|*|f1| - mov %rax, @acc[0] - mov @acc[3], %rax - mov %rdx, @acc[1] - imulq %rcx # |v|*|g1| - add %rax, @acc[0] - adc %rdx, @acc[1] - mov @acc[0], 8*12($out_ptr) # destination |v| - mov @acc[1], 8*13($out_ptr) - sar \$63, @acc[1] # sign extension - mov @acc[1], 8*14($out_ptr) - mov @acc[1], 8*15($out_ptr) - mov @acc[1], 8*16($out_ptr) - mov @acc[1], 8*17($out_ptr) -___ -for($i=2; $i<23; $i++) { -my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31" - : "__smulx_191_n_shift_by_31"; -my $smul_767x63 = $i>11 ? 
"__smulx_767x63" - : "__smulx_383x63"; -$code.=<<___; - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$31, $cnt - call __ab_approximation_31 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - mov $f1, 8*9(%rsp) - mov $g1, 8*10(%rsp) - - mov \$256, $out_ptr - xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| - call $smul_n_shift - mov $f0, 8*7(%rsp) # corrected |f0| - mov $g0, 8*8(%rsp) # corrected |g0| - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr), $out_ptr # pointer to destination |b| - call $smul_n_shift - mov $f0, 8*9(%rsp) # corrected |f1| - mov $g0, 8*10(%rsp) # corrected |g1| - - mov 8*7(%rsp), $f0 # |f0| - mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - call __smulx_383x63 - - mov 8*9(%rsp), $f0 # |f1| - mov 8*10(%rsp), $g0 # |g1| - lea 8*6($out_ptr),$out_ptr # pointer to destination |v| - call $smul_767x63 -___ -$code.=<<___ if ($i==11); - sar \$63, @acc[5] # sign extension - mov @acc[5], 8*6($out_ptr) - mov @acc[5], 8*7($out_ptr) - mov @acc[5], 8*8($out_ptr) - mov @acc[5], 8*9($out_ptr) - mov @acc[5], 8*10($out_ptr) - mov @acc[5], 8*11($out_ptr) -___ -} -$code.=<<___; - ################################# two[!] last iterations in one go - xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| - mov \$53, $cnt # 31 + 766 % 31 - #call __ab_approximation_31 # |a| and |b| are exact, just load - mov 8*0($in_ptr), @acc[0] # |a_lo| - #xor @acc[1], @acc[1] # |a_hi| - mov 8*6($in_ptr), @acc[2] # |b_lo| - #xor @acc[3], @acc[3] # |b_hi| - call __inner_loop_62 - #mov $f0, 8*7(%rsp) - #mov $g0, 8*8(%rsp) - #mov $f1, 8*9(%rsp) - #mov $g1, 8*10(%rsp) - - #mov 8*7(%rsp), $f0 # |f0| - #mov 8*8(%rsp), $g0 # |g0| - lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| - #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| - #call __smulx_383x63 - - #mov 8*9(%rsp), $f0 # |f1| - #mov 8*10(%rsp), $g0 # |g1| - mov $f1, $f0 - mov $g1, $g0 - mov 8*4(%rsp), $out_ptr # original out_ptr - call __smulx_767x63 - - mov 8*5(%rsp), $in_ptr # original n_ptr - mov %rax, %rdx # top limb of the result - sar \$63, %rax # result's sign as mask - - mov %rax, @acc[0] # mask |modulus| - mov %rax, @acc[1] - mov %rax, @acc[2] - and 8*0($in_ptr), @acc[0] - and 8*1($in_ptr), @acc[1] - mov %rax, @acc[3] - and 8*2($in_ptr), @acc[2] - and 8*3($in_ptr), @acc[3] - mov %rax, @acc[4] - and 8*4($in_ptr), @acc[4] - and 8*5($in_ptr), %rax - - add @acc[0], @acc[6] # conditionally add |modulus|<<384 - adc @acc[1], @acc[7] - adc @acc[2], @acc[8] - adc @acc[3], @acc[9] - adc @acc[4], %rcx - adc %rax, %rdx - - mov @acc[6], 8*6($out_ptr) # store absolute value - mov @acc[7], 8*7($out_ptr) - mov @acc[8], 8*8($out_ptr) - mov @acc[9], 8*9($out_ptr) - mov %rcx, 8*10($out_ptr) - mov %rdx, 8*11($out_ptr) - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 -___ -######################################################################## -# Signed |u|*|f?|+|v|*|g?| subroutines. 
"NNN" in "NNNx63" suffix refers -# to the maximum bit-length of the *result*, and "63" - to the maximum -# bit-length of the |f?| and |g?| single-limb multiplicands. However! -# The latter should not be taken literally, as they are always chosen so -# that "bad things" don't happen. For example, there comes a point when -# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we -# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is -# because past that point |f0| is always 1 and |g0| is always 0. And, -# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to -# perform full-width |u|*|f1| multiplication, half-width one with sign -# extension is sufficient... -{ -my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); -my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); -my $fx = @acc[9]; - -$code.=<<___; -.type __smulx_767x63,\@abi-omnipotent -.align 32 -__smulx_767x63: - mov 8*0($in_ptr), @acc[0] # load |u| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - - mov $f0, %rax - sar \$63, %rax # |f0|'s sign as mask - xor $fx, $fx # overrides in_ptr - sub %rax, $fx # |f0|'s sign as bit - - mov $out_ptr, 8*1(%rsp) - mov $in_ptr, 8*2(%rsp) - lea 8*6($in_ptr), $in_ptr # pointer to |v| - - xor %rax, $f0 # conditionally negate |f0| - add $fx, $f0 - - xor %rax, @acc[0] # conditionally negate |u| - xor %rax, @acc[1] - xor %rax, @acc[2] - xor %rax, @acc[3] - xor %rax, @acc[4] - xor @acc[5], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, %rax - - mulx @acc[0], @acc[0], $fx # |u|*|f0| - mulx @acc[1], @acc[1], @acc[5] - add $fx, @acc[1] -___ -for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { -$code.=<<___; - mulx @acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___; - adc \$0, $fx - imulq %rdx - add $fx, %rax - adc \$0, %rdx - - mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov %rax, 8*5($out_ptr) - mov %rdx, 8*6($out_ptr) - sar \$63, %rdx # sign extension - mov %rdx, 8*7($out_ptr) -___ -{ -my $fx=$in_ptr; -$code.=<<___; - mov $g0, $f0 # load |g0| - mov $g0, %rax - - mov 8*0($in_ptr), @acc[0] # load |v| - mov 8*1($in_ptr), @acc[1] - mov 8*2($in_ptr), @acc[2] - mov 8*3($in_ptr), @acc[3] - mov 8*4($in_ptr), @acc[4] - mov 8*5($in_ptr), @acc[5] - mov 8*6($in_ptr), @acc[6] - mov 8*7($in_ptr), @acc[7] - mov 8*8($in_ptr), @acc[8] - mov 8*9($in_ptr), @acc[9] - mov 8*10($in_ptr), @acc[10] - mov 8*11($in_ptr), @acc[11] - - sar \$63, %rax # |g0|'s sign as mask - xor $fx, $fx # overrides in_ptr - sub %rax, $fx # |g0|'s sign as bit - - xor %rax, $f0 # conditionally negate |g0| - add $fx, $f0 - - xor %rax, @acc[0] # conditionally negate |v| - xor %rax, @acc[1] - xor %rax, @acc[2] - xor %rax, @acc[3] - xor %rax, @acc[4] - xor %rax, @acc[5] - xor %rax, @acc[6] - xor %rax, @acc[7] - xor %rax, @acc[8] - xor %rax, @acc[9] - xor %rax, @acc[10] - xor %rax, @acc[11] - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] - adc \$0, @acc[7] - adc \$0, @acc[8] - adc \$0, @acc[9] - adc \$0, @acc[10] - adc \$0, @acc[11] - - mulx @acc[0], @acc[0], %rax # |v|*|g0| - mulx @acc[1], @acc[1], $fx - add %rax, @acc[1] -___ -for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) { -$code.=<<___; - mulx 
@acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___; - mulx @acc[11], @acc[11], $fx - mov 8*1(%rsp), %rdx # out_ptr - mov 8*2(%rsp), $in_ptr # restore original in_ptr - adc @acc[11], %rax - - add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| - adc 8*1(%rdx), @acc[1] - adc 8*2(%rdx), @acc[2] - adc 8*3(%rdx), @acc[3] - adc 8*4(%rdx), @acc[4] - adc 8*5(%rdx), @acc[5] - adc 8*6(%rdx), @acc[6] - mov 8*7(%rdx), @acc[11] # sign extension - adc @acc[11], @acc[7] - adc @acc[11], @acc[8] - adc @acc[11], @acc[9] - adc @acc[11], @acc[10] - adc @acc[11], %rax - - mov %rdx, $out_ptr # restore original out_ptr - - mov @acc[0], 8*0(%rdx) - mov @acc[1], 8*1(%rdx) - mov @acc[2], 8*2(%rdx) - mov @acc[3], 8*3(%rdx) - mov @acc[4], 8*4(%rdx) - mov @acc[5], 8*5(%rdx) - mov @acc[6], 8*6(%rdx) - mov @acc[7], 8*7(%rdx) - mov @acc[8], 8*8(%rdx) - mov @acc[9], 8*9(%rdx) - mov @acc[10], 8*10(%rdx) - mov %rax, 8*11(%rdx) - - ret -.size __smulx_767x63,.-__smulx_767x63 -___ -} -$code.=<<___; -.type __smulx_383x63,\@abi-omnipotent -.align 32 -__smulx_383x63: -___ -for($j=0; $j<2; $j++) { -my $k = 8*6*$j; -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - mov $k+8*4($in_ptr), @acc[4] - mov $k+8*5($in_ptr), @acc[5] - - mov $f0, $fx - sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) - xor %rax, %rax - sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) - - xor $fx, $f0 # conditionally negate |f0| - add %rax, $f0 - - xor $fx, @acc[0] # conditionally negate |u| (or |v|) - xor $fx, @acc[1] - xor $fx, @acc[2] - xor $fx, @acc[3] - xor $fx, @acc[4] - xor $fx, @acc[5] - add %rax, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) - mulx @acc[1], @acc[1], %rax - add $fx, @acc[1] -___ -for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { -$code.=<<___; - mulx @acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___ if ($j==0); - mulx @acc[$i], @acc[$i], %rax - mov $g0, $f0 - adc $fx, @acc[$i] - - mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) -___ -} -$code.=<<___; - mulx @acc[$i], @acc[$i], %rax - adc $fx, @acc[$i] - - add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), @acc[5] - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov @acc[5], 8*5($out_ptr) - - ret -.size __smulx_383x63,.-__smulx_383x63 -___ -######################################################################## -# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of -# the names refers to maximum bit-lengths of |a| and |b|. As already -# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always -# chosen so that "bad things" don't happen. For example, so that the -# sum of the products doesn't overflow, and that the final result is -# never wider than inputs... 
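For orientation, the effect of these shift-by-31 helpers can be modelled on a single limb in C. The sketch below is illustrative only: smul_n_shift_by_31_ref is a hypothetical name, gcc/clang's __int128 with arithmetic right shift is assumed, and a plain branch stands in for the branchless mask-and-negate sequence used by the assembly that follows.

#include <stdint.h>

typedef uint64_t limb_t;

/* Single-limb model of abs(|a|*f + |b|*g) >> 31: the callers choose f and g
 * so that the signed sum fits (here: in 128 bits), and when the sum is
 * negative both the result and the factors are negated, so that f and g
 * stay consistent with the absolute value that is stored back. */
static limb_t smul_n_shift_by_31_ref(limb_t a, limb_t b,
                                     int64_t *f, int64_t *g)
{
    __int128 t = (__int128)a * *f + (__int128)b * *g;

    t >>= 31;              /* arithmetic shift; low bits are zero in use */

    if (t < 0) {           /* the assembly does this with masks          */
        t = -t;
        *f = -*f;
        *g = -*g;
    }

    return (limb_t)t;      /* model truncates; real helpers are multi-limb */
}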
-{ -$code.=<<___; -.type __smulx_383_n_shift_by_31,\@abi-omnipotent -.align 32 -__smulx_383_n_shift_by_31: - mov $f0, @acc[8] - xor @acc[6], @acc[6] -___ -my $f0 = @acc[8]; -for($j=0; $j<2; $j++) { -my $k = 8*6*$j; -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - mov $k+8*3($in_ptr), @acc[3] - mov $k+8*4($in_ptr), @acc[4] - mov $k+8*5($in_ptr), @acc[5] - - mov %rdx, %rax - sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) - xor $fx, $fx - sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) - - xor %rax, %rdx # conditionally negate |f0| (or |g0|) - add $fx, %rdx - - xor %rax, @acc[0] # conditionally negate |a| (or |b|) - xor %rax, @acc[1] - xor %rax, @acc[2] - xor %rax, @acc[3] - xor %rax, @acc[4] - xor @acc[5], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, %rax - - mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) - mulx @acc[1], @acc[1], @acc[5] - add $fx, @acc[1] -___ -for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { -$code.=<<___; - mulx @acc[$i], @acc[$i], $a - adc $b, @acc[$i] -___ - ($a, $b) = ($b, $a); -} -$code.=<<___ if ($j==0); - adc \$0, $fx - imulq %rdx - add $fx, %rax - adc %rdx, @acc[6] - - mov $g0, %rdx - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov %rax, 8*5($out_ptr) -___ -} -$code.=<<___; - adc \$0, $fx - imulq %rdx - add $fx, %rax - adc \$0, %rdx - - add 8*0($out_ptr), @acc[0] - adc 8*1($out_ptr), @acc[1] - adc 8*2($out_ptr), @acc[2] - adc 8*3($out_ptr), @acc[3] - adc 8*4($out_ptr), @acc[4] - adc 8*5($out_ptr), %rax - adc %rdx, @acc[6] - mov $f0, %rdx - - shrd \$31, @acc[1], @acc[0] - shrd \$31, @acc[2], @acc[1] - shrd \$31, @acc[3], @acc[2] - shrd \$31, @acc[4], @acc[3] - shrd \$31, %rax, @acc[4] - shrd \$31, @acc[6], %rax - - sar \$63, @acc[6] # sign as mask - xor $fx, $fx - sub @acc[6], $fx # sign as bit - - xor @acc[6], @acc[0] # conditionally negate the result - xor @acc[6], @acc[1] - xor @acc[6], @acc[2] - xor @acc[6], @acc[3] - xor @acc[6], @acc[4] - xor @acc[6], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, @acc[2] - adc \$0, @acc[3] - adc \$0, @acc[4] - adc \$0, %rax - - mov @acc[0], 8*0($out_ptr) - mov @acc[1], 8*1($out_ptr) - mov @acc[2], 8*2($out_ptr) - mov @acc[3], 8*3($out_ptr) - mov @acc[4], 8*4($out_ptr) - mov %rax, 8*5($out_ptr) - - xor @acc[6], %rdx # conditionally negate |f0| - xor @acc[6], $g0 # conditionally negate |g0| - add $fx, %rdx - add $fx, $g0 - - ret -.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 -___ -} { -$code.=<<___; -.type __smulx_191_n_shift_by_31,\@abi-omnipotent -.align 32 -__smulx_191_n_shift_by_31: - mov $f0, @acc[8] -___ -my $f0 = @acc[8]; -for($j=0; $j<2; $j++) { -my $k = 8*6*$j; -my @acc=@acc; - @acc=@acc[3..5] if ($j); -$code.=<<___; - mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) - mov $k+8*1($in_ptr), @acc[1] - mov $k+8*2($in_ptr), @acc[2] - - mov %rdx, %rax - sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) - xor $fx, $fx - sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) - - xor %rax, %rdx # conditionally negate |f0| (or |g0|) - add $fx, %rdx - - xor %rax, @acc[0] # conditionally negate |a| (or |b|) - xor %rax, @acc[1] - xor @acc[2], %rax - add $fx, @acc[0] - adc \$0, @acc[1] - adc \$0, %rax - - mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) - mulx @acc[1], @acc[1], @acc[2] - add $fx, @acc[1] - adc \$0, @acc[2] - imulq %rdx - add %rax, @acc[2] - adc 
\$0, %rdx -___ -$code.=<<___ if ($j==0); - mov %rdx, @acc[6] - mov $g0, %rdx -___ -} -$code.=<<___; - add @acc[0], @acc[3] - adc @acc[1], @acc[4] - adc @acc[2], @acc[5] - adc %rdx, @acc[6] - mov $f0, %rdx - - shrd \$31, @acc[4], @acc[3] - shrd \$31, @acc[5], @acc[4] - shrd \$31, @acc[6], @acc[5] - - sar \$63, @acc[6] # sign as mask - xor $fx, $fx - sub @acc[6], $fx # sign as bit - - xor @acc[6], @acc[3] # conditionally negate the result - xor @acc[6], @acc[4] - xor @acc[6], @acc[5] - add $fx, @acc[3] - adc \$0, @acc[4] - adc \$0, @acc[5] - - mov @acc[3], 8*0($out_ptr) - mov @acc[4], 8*1($out_ptr) - mov @acc[5], 8*2($out_ptr) - - xor @acc[6], %rdx # conditionally negate |f0| - xor @acc[6], $g0 # conditionally negate |g0| - add $fx, %rdx - add $fx, $g0 - - ret -.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 -___ -} } - -{ -my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); -my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); -my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); -my ($a_, $b_) = ($a_lo, $b_lo); -{ -my @a = ($a_lo, $t1, $a_hi); -my @b = ($b_lo, $t2, $b_hi); - -$code.=<<___; -.type __ab_approximation_31,\@abi-omnipotent -.align 32 -__ab_approximation_31: - mov 8*5($in_ptr), @a[2] # load |a| in reverse order - mov 8*11($in_ptr), @b[2] # load |b| in reverse order - mov 8*4($in_ptr), @a[1] - mov 8*10($in_ptr), @b[1] - mov 8*3($in_ptr), @a[0] - mov 8*9($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # check top-most limbs, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*2($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*8($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... ones before top-most, ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*1($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*7($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... - cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - mov 8*0($in_ptr), @a[0] - cmovz @b[0], @b[1] - mov 8*6($in_ptr), @b[0] - - mov @a[2], $t0 - or @b[2], $t0 # ... and ones before that ... 
- cmovz @a[1], @a[2] - cmovz @b[1], @b[2] - cmovz @a[0], @a[1] - cmovz @b[0], @b[1] - - mov @a[2], $t0 - or @b[2], $t0 - bsr $t0, %rcx - lea 1(%rcx), %rcx - cmovz @a[0], @a[2] - cmovz @b[0], @b[2] - cmovz $t0, %rcx - neg %rcx - #and \$63, %rcx # debugging artefact - - shldq %cl, @a[1], @a[2] # align second limb to the left - shldq %cl, @b[1], @b[2] - - mov \$0x7FFFFFFF, %eax - and %rax, @a[0] - and %rax, @b[0] - andn @a[2], %rax, @a[2] - andn @b[2], %rax, @b[2] - or @a[2], @a[0] - or @b[2], @b[0] - - jmp __inner_loop_31 - - ret -.size __ab_approximation_31,.-__ab_approximation_31 -___ -} -$code.=<<___; -.type __inner_loop_31,\@abi-omnipotent -.align 32 -__inner_loop_31: ################# by Thomas Pornin - mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 - mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 - mov \$0x7FFFFFFF7FFFFFFF, $bias - -.Loop_31: - cmp $b_, $a_ # if |a_|<|b_|, swap the variables - mov $a_, $t0 - mov $b_, $t1 - mov $fg0, $t2 - mov $fg1, $t3 - cmovb $b_, $a_ - cmovb $t0, $b_ - cmovb $fg1, $fg0 - cmovb $t2, $fg1 - - sub $b_, $a_ # |a_|-|b_| - sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| - add $bias, $fg0 - - test \$1, $t0 # if |a_| was even, roll back - cmovz $t0, $a_ - cmovz $t1, $b_ - cmovz $t2, $fg0 - cmovz $t3, $fg1 - - shr \$1, $a_ # |a_|>>=1 - add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 - sub $bias, $fg1 - sub \$1, $cnt - jnz .Loop_31 - - shr \$32, $bias - mov %ecx, %edx # $fg0, $f0 - mov ${fg1}d, ${f1}d - shr \$32, $g0 - shr \$32, $g1 - sub $bias, $f0 # remove the bias - sub $bias, $g0 - sub $bias, $f1 - sub $bias, $g1 - - ret -.size __inner_loop_31,.-__inner_loop_31 - -.type __inner_loop_62,\@abi-omnipotent -.align 32 -__inner_loop_62: - mov \$1, $f0 # |f0|=1 - xor $g0, $g0 # |g0|=0 - xor $f1, $f1 # |f1|=0 - mov \$1, $g1 # |g1|=1 - -.Loop_62: - xor $t0, $t0 - test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| - mov $b_lo, $t1 - cmovnz $b_lo, $t0 - sub $a_lo, $t1 # |b_|-|a_| - mov $a_lo, $t2 - sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) - cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| - cmovc $t2, $b_lo # |b_| = |a_| - mov $f0, $t0 # exchange |f0| and |f1| - cmovc $f1, $f0 - cmovc $t0, $f1 - mov $g0, $t1 # exchange |g0| and |g1| - cmovc $g1, $g0 - cmovc $t1, $g1 - xor $t0, $t0 - xor $t1, $t1 - shr \$1, $a_lo - test \$1, $t2 # if |a_| was odd, then we'll be subtracting... - cmovnz $f1, $t0 - cmovnz $g1, $t1 - add $f1, $f1 # |f1|<<=1 - add $g1, $g1 # |g1|<<=1 - sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) - sub \$1, $cnt - jnz .Loop_62 - - ret -.size __inner_loop_62,.-__inner_loop_62 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/div3w-armv8.pl b/crypto/blst_src/asm/div3w-armv8.pl deleted file mode 100755 index bfa32453c3a..00000000000 --- a/crypto/blst_src/asm/div3w-armv8.pl +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
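The two inner loops above are the core of the binary extended GCD: a fixed number of steps is run on single-limb values while the update factors f0, g0, f1, g1 are accumulated (__inner_loop_31 additionally packs each f/g pair into one biased register). A rough C model of the __inner_loop_62 variant is sketched below; inner_loop_62_ref is a hypothetical name, and plain branches replace the cmov-based, branch-free flow of the assembly.

#include <stdint.h>

typedef uint64_t limb_t;

/* One batch of n GCD steps on single-limb |a| and |b| (exact in the last
 * two iterations, where the full values fit in one limb).  On return the
 * factors satisfy  A*f0 + B*g0 == a<<n  and  A*f1 + B*g1 == b<<n  for the
 * starting values A and B, which is how the caller updates |a|,|b| and
 * |u|,|v| with the __smulx_* helpers. */
static void inner_loop_62_ref(limb_t a, limb_t b, unsigned n, int64_t fg[4])
{
    int64_t f0 = 1, g0 = 0, f1 = 0, g1 = 1, t;

    while (n--) {
        if (a & 1) {                     /* odd: subtract |b| from |a|    */
            if (a < b) {                 /* swap so the difference is >=0 */
                limb_t s = a; a = b; b = s;
                t = f0; f0 = f1; f1 = t;
                t = g0; g0 = g1; g1 = t;
            }
            a -= b;
            f0 -= f1;
            g0 -= g1;
        }
        a >>= 1;                         /* a is even here, halving exact */
        f1 <<= 1;                        /* |f1|<<=1, |g1|<<=1 as in asm  */
        g1 <<= 1;
    }

    fg[0] = f0; fg[1] = g0; fg[2] = f1; fg[3] = g1;
}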
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -$code.=<<___; -.text - -.globl div_3_limbs -.type div_3_limbs,%function -.align 5 -div_3_limbs: - ldp x4,x5,[x0] // load R - eor x0,x0,x0 // Q = 0 - mov x3,#64 // loop counter - nop - -.Loop: - subs x6,x4,x1 // R - D - add x0,x0,x0 // Q <<= 1 - sbcs x7,x5,x2 - add x0,x0,#1 // Q + speculative bit - csel x4,x4,x6,lo // select between R and R - D - extr x1,x2,x1,#1 // D >>= 1 - csel x5,x5,x7,lo - lsr x2,x2,#1 - sbc x0,x0,xzr // subtract speculative bit - sub x3,x3,#1 - cbnz x3,.Loop - - asr x3,x0,#63 // top bit -> mask - add x0,x0,x0 // Q <<= 1 - subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit - sbcs x7,x5,x2 - sbc x0,x0,xzr // subtract speculative bit - - orr x0,x0,x3 // all ones if overflow - - ret -.size div_3_limbs,.-div_3_limbs -___ -{ -my ($div_rem, $divisor, $quot) = map("x$_",(0..2)); -my @div = map("x$_",(3..4)); -my @acc = map("x$_",(5..7)); -my @t = map("x$_",(8..11)); - -$code.=<<___; -.globl quot_rem_128 -.type quot_rem_128,%function -.align 5 -quot_rem_128: - ldp @div[0],@div[1],[$divisor] - - mul @acc[0],@div[0],$quot // divisor[0:1} * quotient - umulh @acc[1],@div[0],$quot - mul @t[3], @div[1],$quot - umulh @acc[2],@div[1],$quot - - ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend - ldr @t[2],[$div_rem,#16] - - adds @acc[1],@acc[1],@t[3] - adc @acc[2],@acc[2],xzr - - subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient - sbcs @t[1],@t[1],@acc[1] - sbcs @t[2],@t[2],@acc[2] - sbc @acc[0],xzr,xzr // borrow -> mask - - add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ... - and @div[0],@div[0],@acc[0] - and @div[1],@div[1],@acc[0] - adds @t[0],@t[0],@div[0] // ... and add divisor - adc @t[1],@t[1],@div[1] - - stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder - str $quot,[$div_rem,#16] // and one limb of the quotient - - mov x0,$quot // return adjusted quotient - - ret -.size quot_rem_128,.-quot_rem_128 - -.globl quot_rem_64 -.type quot_rem_64,%function -.align 5 -quot_rem_64: - ldr @div[0],[$divisor] - ldr @t[0],[$div_rem] // load 1 limb of the dividend - - mul @acc[0],@div[0],$quot // divisor * quotient - - sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient - - stp @t[0],$quot,[$div_rem] // save remainder and quotient - - mov x0,$quot // return quotient - - ret -.size quot_rem_64,.-quot_rem_64 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/div3w-x86_64.pl b/crypto/blst_src/asm/div3w-x86_64.pl deleted file mode 100755 index b8192db8e6d..00000000000 --- a/crypto/blst_src/asm/div3w-x86_64.pl +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -$c_ref=<<'___'; -/* - * |div_top| points at two most significant limbs of the dividend, |d_hi| - * and |d_lo| are two most significant limbs of the divisor. If divisor - * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. - * The divisor is required to be "bitwise left-aligned," and dividend's - * top limbs to be not larger than the divisor's. The latter limitation - * can be problematic in the first iteration of multi-precision division, - * where in most general case the condition would have to be "smaller." - * The subroutine considers four limbs, two of which are "overlapping," - * hence the name... Another way to look at it is to think of the pair - * of the dividend's limbs being suffixed with a zero: - * +-------+-------+-------+ - * R | | | 0 | - * +-------+-------+-------+ - * +-------+-------+ - * D | | | - * +-------+-------+ - */ -limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) -{ - llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; - llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; - limb_t Q = 0, mask; - size_t i; - - for (i = 0; i < LIMB_BITS; i++) { - Q <<= 1; - mask = (R >= D); - Q |= mask; - R -= (D & ((llimb_t)0 - mask)); - D >>= 1; - } - - mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ - - Q <<= 1; - Q |= (R >= D); - - return (Q | mask); -} -___ - -$code.=<<___; -.text - -.globl div_3_limbs -.hidden div_3_limbs -.type div_3_limbs,\@function,3 -.align 32 -div_3_limbs: - mov (%rdi),%r8 # load R.lo - mov 8(%rdi),%r9 # load R.hi - xor %rax,%rax # Q = 0 - mov \$64,%ecx # loop counter - -.Loop: - mov %r8,%r10 # put aside R - sub %rsi,%r8 # R -= D - mov %r9,%r11 - sbb %rdx,%r9 - lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit - mov %rdx,%rdi - cmovc %r10,%r8 # restore R if R - D borrowed - cmovc %r11,%r9 - sbb \$0,%rax # subtract speculative bit - shl \$63,%rdi - shr \$1,%rsi - shr \$1,%rdx - or %rdi,%rsi # D >>= 1 - sub \$1,%ecx - jnz .Loop - - lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit - sar \$63,%rax # top bit -> mask - - sub %rsi,%r8 # R -= D - sbb %rdx,%r9 - sbb \$0,%rcx # subtract speculative bit - - or %rcx,%rax # all ones if overflow - - ret -.size div_3_limbs,.-div_3_limbs -___ -######################################################################## -# Calculate remainder and adjust the quotient, which can be off-by-one. -# Then save quotient in limb next to top limb of the remainder. There is -# place, because the remainder/next-iteration-dividend gets shorter by -# one limb. 
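The adjustment described above can be modelled in C in the same style as the div_3_limbs reference at the top of this file. The sketch below uses a hypothetical name, quot_rem_128_ref, assumes 64-bit limbs with gcc/clang's unsigned __int128, and spells out the borrow propagation that the assembly does with sbb/adc and a mask.

#include <stdint.h>

typedef uint64_t limb_t;
typedef unsigned __int128 llimb_t;

/* rem(2 limbs) = dividend(3 limbs) - divisor(2 limbs)*quotient; if the
 * estimate from div_3_limbs was one too large, back the quotient off and
 * add the divisor once.  The adjusted quotient goes into the limb freed
 * up next to the remainder and is also returned. */
static limb_t quot_rem_128_ref(limb_t div_rem[3], const limb_t divisor[2],
                               limb_t quotient)
{
    limb_t prod[3], rem[3], borrow = 0, mask;
    llimb_t w;
    int i;

    w = (llimb_t)divisor[0] * quotient;             /* divisor * quotient */
    prod[0] = (limb_t)w;
    w = (llimb_t)divisor[1] * quotient + (limb_t)(w >> 64);
    prod[1] = (limb_t)w;
    prod[2] = (limb_t)(w >> 64);

    for (i = 0; i < 3; i++) {                       /* dividend - product */
        w = (llimb_t)div_rem[i] - prod[i] - borrow;
        rem[i] = (limb_t)w;
        borrow = (limb_t)(w >> 64) & 1;
    }

    mask = (limb_t)0 - borrow;                      /* borrow -> all-ones */
    quotient += mask;                               /* quotient -= borrow */
    w = (llimb_t)rem[0] + (divisor[0] & mask);      /* add divisor back   */
    rem[0] = (limb_t)w;
    rem[1] = rem[1] + (divisor[1] & mask) + (limb_t)(w >> 64);

    div_rem[0] = rem[0];                            /* 2-limb remainder   */
    div_rem[1] = rem[1];
    div_rem[2] = quotient;                          /* quotient beside it */

    return quotient;
}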
-{ -my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); -my @acc = ("%r8", "%r9", "%rdx"); -my @tmp = ("%r10", "%r11", "%rax"); - -$code.=<<___; -.globl quot_rem_128 -.hidden quot_rem_128 -.type quot_rem_128,\@function,3 -.align 32 -quot_rem_128: - mov %rdx, %rax - mov %rdx, $quotient - - mulq 0($divisor) # divisor[0:1] * quotient - mov %rax, @acc[0] - mov $quotient, %rax - mov %rdx, @acc[1] - - mulq 8($divisor) - add %rax, @acc[1] - adc \$0, %rdx # %rdx is @acc[2] - - mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend - mov 8($div_rem), @tmp[1] - mov 16($div_rem), @tmp[2] - - sub @acc[0], @tmp[0] # dividend - divisor * quotient - sbb @acc[1], @tmp[1] - sbb @acc[2], @tmp[2] - sbb @acc[0], @acc[0] # borrow -> mask - - add @acc[0], $quotient # if borrowed, adjust the quotient ... - mov @acc[0], @acc[1] - and 0($divisor), @acc[0] - and 8($divisor), @acc[1] - add @acc[0], @tmp[0] # ... and add divisor - adc @acc[1], @tmp[1] - - mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... - mov @tmp[1], 8($div_rem) - mov $quotient, 16($div_rem) # ... and 1 limb of the quotient - - mov $quotient, %rax # return adjusted quotient - - ret -.size quot_rem_128,.-quot_rem_128 - -######################################################################## -# Unlike 128-bit case above, quotient is exact. As result just one limb -# of the dividend is sufficient to calculate the remainder... - -.globl quot_rem_64 -.hidden quot_rem_64 -.type quot_rem_64,\@function,3 -.align 32 -quot_rem_64: - mov %rdx, %rax # return quotient - imulq 0($divisor), %rdx # divisor[0] * quotient - - mov 0($div_rem), @tmp[0] # load 1 limb of the dividend - - sub %rdx, @tmp[0] # dividend - divisor * quotient - - mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... - mov %rax, 8($div_rem) # ... and 1 limb of the quotient - - ret -.size quot_rem_64,.-quot_rem_64 -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_256-armv8.pl b/crypto/blst_src/asm/mul_mont_256-armv8.pl deleted file mode 100755 index ba6c2b87980..00000000000 --- a/crypto/blst_src/asm/mul_mont_256-armv8.pl +++ /dev/null @@ -1,409 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# As for "sparse" in subroutine names, see commentary in the -# asm/mulx_mont_256-x86_64.pl module. - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); - -@mod=map("x$_",(5..8)); -$bi="x9"; -@a=map("x$_",(10..13)); -@tmp=map("x$_",(14..17)); -@acc=map("x$_",(19..24)); -$m0=$n_ptr; - -$code.=<<___; -.text - -.globl mul_mont_sparse_256 -.hidden mul_mont_sparse_256 -.type mul_mont_sparse_256,%function -.align 5 -mul_mont_sparse_256: - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - mul @acc[0],@a[0],$bi - ldp @mod[0],@mod[1],[$n_ptr] - mul @acc[1],@a[1],$bi - ldp @mod[2],@mod[3],[$n_ptr,#16] - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - - umulh @tmp[0],@a[0],$bi - umulh @tmp[1],@a[1],$bi - mul $m0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - umulh @tmp[3],@a[3],$bi - adds @acc[1],@acc[1],@tmp[0] - //mul @tmp[0],@mod[0],$m0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$m0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$m0 - adc @acc[4],xzr, @tmp[3] - mul @tmp[3],@mod[3],$m0 -___ -for ($i=1;$i<4;$i++) { -$code.=<<___; - ldr $bi,[$b_ptr,8*$i] - subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @acc[4],@acc[4],xzr - - adds @acc[0],@acc[1],@tmp[0] - mul @tmp[0],@a[0],$bi - adcs @acc[1],@acc[2],@tmp[1] - mul @tmp[1],@a[1],$bi - adcs @acc[2],@acc[3],@tmp[2] - mul @tmp[2],@a[2],$bi - adcs @acc[3],@acc[4],@tmp[3] - mul @tmp[3],@a[3],$bi - adc @acc[4],xzr,xzr - - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@a[0],$bi - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@a[1],$bi - adcs @acc[2],@acc[2],@tmp[2] - mul $m0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@a[3],$bi - adc @acc[4],@acc[4],xzr - - adds @acc[1],@acc[1],@tmp[0] - //mul @tmp[0],@mod[0],$m0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$m0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$m0 - adc @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$m0 -___ -} -$code.=<<___; - subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @acc[4],@acc[4],xzr - - adds @acc[0],@acc[1],@tmp[0] - adcs @acc[1],@acc[2],@tmp[1] - adcs @acc[2],@acc[3],@tmp[2] - adcs @acc[3],@acc[4],@tmp[3] - adc @acc[4],xzr,xzr - - subs @tmp[0],@acc[0],@mod[0] - sbcs @tmp[1],@acc[1],@mod[1] - sbcs @tmp[2],@acc[2],@mod[2] - sbcs @tmp[3],@acc[3],@mod[3] - sbcs xzr, @acc[4],xzr - - csel @acc[0],@acc[0],@tmp[0],lo - csel @acc[1],@acc[1],@tmp[1],lo - csel @acc[2],@acc[2],@tmp[2],lo - csel @acc[3],@acc[3],@tmp[3],lo - - stp @acc[0],@acc[1],[$r_ptr] - stp @acc[2],@acc[3],[$r_ptr,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - ret -.size mul_mont_sparse_256,.-mul_mont_sparse_256 -___ -{ -my @acc = (@a,@acc[0..3]); -my @a = @mod; - -$code.=<<___; -.globl sqr_mont_sparse_256 -.hidden sqr_mont_sparse_256 -.type sqr_mont_sparse_256,%function -.align 5 -sqr_mont_sparse_256: - paciasp - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - mov $n0,$n_ptr - - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul @acc[1],@a[1],@a[0] // a[1]*a[0] - umulh @tmp[1],@a[1],@a[0] - mul @acc[2],@a[2],@a[0] // a[2]*a[0] - umulh @tmp[2],@a[2],@a[0] - mul @acc[3],@a[3],@a[0] // a[3]*a[0] - umulh @acc[4],@a[3],@a[0] - - adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication - mul @tmp[0],@a[2],@a[1] // a[2]*a[1] - umulh @tmp[1],@a[2],@a[1] - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@a[3],@a[1] // a[3]*a[1] - umulh @tmp[3],@a[3],@a[1] - adc @acc[4],@acc[4],xzr // can't overflow - - mul @acc[5],@a[3],@a[2] // a[3]*a[2] - umulh @acc[6],@a[3],@a[2] - - adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication - mul @acc[0],@a[0],@a[0] // a[0]*a[0] - adc @tmp[2],@tmp[3],xzr // can't overflow - - adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication - umulh @a[0],@a[0],@a[0] - adcs @acc[4],@acc[4],@tmp[1] - mul @tmp[1],@a[1],@a[1] // a[1]*a[1] - adcs @acc[5],@acc[5],@tmp[2] - umulh @a[1],@a[1],@a[1] - adc @acc[6],@acc[6],xzr // can't overflow - - adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 - mul @tmp[2],@a[2],@a[2] // a[2]*a[2] - adcs @acc[2],@acc[2],@acc[2] - umulh @a[2],@a[2],@a[2] - adcs @acc[3],@acc[3],@acc[3] - mul @tmp[3],@a[3],@a[3] // a[3]*a[3] - adcs @acc[4],@acc[4],@acc[4] - umulh @a[3],@a[3],@a[3] - adcs @acc[5],@acc[5],@acc[5] - adcs @acc[6],@acc[6],@acc[6] - adc @acc[7],xzr,xzr - - adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] - adcs @acc[2],@acc[2],@tmp[1] - adcs @acc[3],@acc[3],@a[1] - adcs @acc[4],@acc[4],@tmp[2] - adcs @acc[5],@acc[5],@a[2] - adcs @acc[6],@acc[6],@tmp[3] - adc @acc[7],@acc[7],@a[3] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - adds @acc[0],@acc[0],@acc[4] // accumulate upper half - adcs @acc[1],@acc[1],@acc[5] - adcs @acc[2],@acc[2],@acc[6] - adcs @acc[3],@acc[3],@acc[7] - adc @acc[4],xzr,xzr - - subs @tmp[0],@acc[0],@mod[0] - sbcs @tmp[1],@acc[1],@mod[1] - sbcs @tmp[2],@acc[2],@mod[2] - sbcs @tmp[3],@acc[3],@mod[3] - sbcs xzr, @acc[4],xzr - - csel @acc[0],@acc[0],@tmp[0],lo - csel @acc[1],@acc[1],@tmp[1],lo - csel @acc[2],@acc[2],@tmp[2],lo - csel @acc[3],@acc[3],@tmp[3],lo - - stp @acc[0],@acc[1],[$r_ptr] - stp @acc[2],@acc[3],[$r_ptr,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - autiasp - ret -.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 -___ -} -{ -my @a = (@a, $bi); - -$code.=<<___; -.globl from_mont_256 -.hidden from_mont_256 -.type from_mont_256,%function -.align 5 -from_mont_256: - paciasp - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - mov $n0,$n_ptr - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - subs @tmp[0],@a[0],@mod[0] - sbcs @tmp[1],@a[1],@mod[1] - sbcs @tmp[2],@a[2],@mod[2] - sbcs @tmp[3],@a[3],@mod[3] - - csel @a[0],@a[0],@tmp[0],lo - csel @a[1],@a[1],@tmp[1],lo - csel @a[2],@a[2],@tmp[2],lo - csel @a[3],@a[3],@tmp[3],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ldr x29,[sp],#16 - autiasp - ret -.size from_mont_256,.-from_mont_256 - -.globl redc_mont_256 -.hidden redc_mont_256 -.type redc_mont_256,%function -.align 5 -redc_mont_256: - paciasp - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - mov $n0,$n_ptr - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - ldp @tmp[0],@tmp[1],[$a_ptr,#32] - ldp @tmp[2],@tmp[3],[$a_ptr,#48] - - adds @a[0],@a[0],@tmp[0] - adcs @a[1],@a[1],@tmp[1] - adcs @a[2],@a[2],@tmp[2] - adcs @a[3],@a[3],@tmp[3] - adc @a[4],xzr,xzr - - subs @tmp[0],@a[0],@mod[0] - sbcs @tmp[1],@a[1],@mod[1] - sbcs @tmp[2],@a[2],@mod[2] - sbcs @tmp[3],@a[3],@mod[3] - sbcs xzr, @a[4],xzr - - csel @a[0],@a[0],@tmp[0],lo - csel @a[1],@a[1],@tmp[1],lo - csel @a[2],@a[2],@tmp[2],lo - csel @a[3],@a[3],@tmp[3],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - - ldr x29,[sp],#16 - autiasp - ret -.size redc_mont_256,.-redc_mont_256 - -.type __mul_by_1_mont_256,%function -.align 5 -__mul_by_1_mont_256: - mul $m0,$n0,@a[0] - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] -___ -for ($i=1;$i<4;$i++) { -$code.=<<___; - //mul @tmp[0],@mod[0],$m0 - mul @tmp[1],@mod[1],$m0 - mul @tmp[2],@mod[2],$m0 - mul @tmp[3],@mod[3],$m0 - subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @a[1],@a[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @a[2],@a[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @a[3],@a[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @a[4],xzr,xzr - - adds @a[0],@a[1],@tmp[0] - adcs @a[1],@a[2],@tmp[1] - adcs @a[2],@a[3],@tmp[2] - mul $m0,$n0,@a[0] - adc @a[3],@a[4],@tmp[3] -___ -} -$code.=<<___; - //mul @tmp[0],@mod[0],$m0 - mul @tmp[1],@mod[1],$m0 - mul @tmp[2],@mod[2],$m0 - mul @tmp[3],@mod[3],$m0 - subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] - umulh @tmp[0],@mod[0],$m0 - adcs @a[1],@a[1],@tmp[1] - umulh @tmp[1],@mod[1],$m0 - adcs @a[2],@a[2],@tmp[2] - umulh @tmp[2],@mod[2],$m0 - adcs @a[3],@a[3],@tmp[3] - umulh @tmp[3],@mod[3],$m0 - adc @a[4],xzr,xzr - - adds @a[0],@a[1],@tmp[0] - adcs @a[1],@a[2],@tmp[1] - adcs @a[2],@a[3],@tmp[2] - adc @a[3],@a[4],@tmp[3] - - ret -.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 -___ -} - -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/mul_mont_384-armv8.pl b/crypto/blst_src/asm/mul_mont_384-armv8.pl deleted file mode 100755 index 44e12a00b03..00000000000 --- a/crypto/blst_src/asm/mul_mont_384-armv8.pl +++ /dev/null @@ -1,2015 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
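The __mul_by_1_mont_256 helper above is a word-by-word Montgomery reduction by R = 2^256: each pass folds in one multiple of the modulus so that the low limb vanishes, then drops that limb. A loose C model follows; mul_by_1_mont_n_ref is a hypothetical name, 64-bit limbs with __int128 are assumed, and n0 is taken to be -p^-1 mod 2^64, with the final conditional subtraction left to the caller as in from_mont_256 and redc_mont_256.

#include <stdint.h>

typedef uint64_t limb_t;
typedef unsigned __int128 llimb_t;

#define NLIMBS 4

/* In-place reduction: on return a[] holds a_in * 2^(-64*NLIMBS) mod p,
 * possibly not yet fully reduced (at most one extra p), which the callers
 * handle with a conditional subtraction. */
static void mul_by_1_mont_n_ref(limb_t a[NLIMBS], const limb_t p[NLIMBS],
                                limb_t n0)
{
    int i, j;

    for (i = 0; i < NLIMBS; i++) {
        limb_t m = a[0] * n0;           /* a[0] + m*p[0] == 0 mod 2^64   */
        llimb_t w = (llimb_t)m * p[0] + a[0];
        limb_t carry = (limb_t)(w >> 64);

        for (j = 1; j < NLIMBS; j++) {  /* add m*p, shift down one limb  */
            w = (llimb_t)m * p[j] + a[j] + carry;
            a[j - 1] = (limb_t)w;
            carry = (limb_t)(w >> 64);
        }
        a[NLIMBS - 1] = carry;
    }
}

In the same spirit, redc_mont_256 runs this pass and then accumulates the upper half of its 512-bit input before the conditional subtraction.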
-# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); - -@mod = map("x$_",(5..10)); -@a = map("x$_",(11..16)); -$bi = "x17"; -@acc = map("x$_",(19..25)); -@tmp = map("x$_",(26..28,0,1,3)); - -$code.=<<___; -.text - -.globl add_mod_384x384 -.type add_mod_384x384,%function -.align 5 -add_mod_384x384: - paciasp - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __add_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - autiasp - ret -.size add_mod_384x384,.-add_mod_384x384 - -.type __add_mod_384x384,%function -.align 5 -__add_mod_384x384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - adds @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - adcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - adcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - adcs @a[3],@a[3],@acc[3] - stp @a[0], @a[1], [$r_ptr] - adcs @a[4],@a[4],@acc[4] - ldp @a[0], @a[1], [$a_ptr,#48] - adcs @a[5],@a[5],@acc[5] - - ldp @acc[0],@acc[1],[$b_ptr,#48] - stp @a[2], @a[3], [$r_ptr,#16] - ldp @a[2], @a[3], [$a_ptr,#64] - ldp @acc[2],@acc[3],[$b_ptr,#64] - - adcs @a[0],@a[0],@acc[0] - stp @a[4], @a[5], [$r_ptr,#32] - adcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#80] - adcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#80] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adcs @a[5],@a[5],@acc[5] - adc $bi,xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,$bi,xzr - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - stp @a[0],@a[1],[$r_ptr,#48] - csel @a[4],@a[4],@acc[4],lo - stp @a[2],@a[3],[$r_ptr,#64] - csel @a[5],@a[5],@acc[5],lo - stp @a[4],@a[5],[$r_ptr,#80] - - ret -.size __add_mod_384x384,.-__add_mod_384x384 - -.globl sub_mod_384x384 -.type sub_mod_384x384,%function -.align 5 -sub_mod_384x384: - paciasp - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - autiasp - ret -.size sub_mod_384x384,.-sub_mod_384x384 - -.type __sub_mod_384x384,%function -.align 5 -__sub_mod_384x384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - subs @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - sbcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - sbcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - sbcs @a[3],@a[3],@acc[3] - stp @a[0], @a[1], [$r_ptr] - sbcs @a[4],@a[4],@acc[4] - ldp @a[0], @a[1], [$a_ptr,#48] - sbcs @a[5],@a[5],@acc[5] - - ldp @acc[0],@acc[1],[$b_ptr,#48] - stp @a[2], @a[3], [$r_ptr,#16] - ldp @a[2], @a[3], [$a_ptr,#64] - ldp @acc[2],@acc[3],[$b_ptr,#64] - - sbcs @a[0],@a[0],@acc[0] - stp @a[4], @a[5], [$r_ptr,#32] - sbcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#80] - sbcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#80] - sbcs @a[3],@a[3],@acc[3] - sbcs @a[4],@a[4],@acc[4] - sbcs @a[5],@a[5],@acc[5] - sbc $bi,xzr,xzr - - and @acc[0],@mod[0],$bi - and @acc[1],@mod[1],$bi - adds @a[0],@a[0],@acc[0] - and @acc[2],@mod[2],$bi - adcs @a[1],@a[1],@acc[1] - and @acc[3],@mod[3],$bi - adcs @a[2],@a[2],@acc[2] - and @acc[4],@mod[4],$bi - adcs @a[3],@a[3],@acc[3] - and @acc[5],@mod[5],$bi - adcs @a[4],@a[4],@acc[4] - stp @a[0],@a[1],[$r_ptr,#48] - adc @a[5],@a[5],@acc[5] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - ret -.size __sub_mod_384x384,.-__sub_mod_384x384 - -.type __add_mod_384,%function -.align 5 -__add_mod_384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - adds @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - adcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - adcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adcs @a[5],@a[5],@acc[5] - adc $bi,xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,$bi,xzr - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - csel @a[4],@a[4],@acc[4],lo - stp @a[0],@a[1],[$r_ptr] - csel @a[5],@a[5],@acc[5],lo - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ret -.size __add_mod_384,.-__add_mod_384 - -.type __sub_mod_384,%function -.align 5 -__sub_mod_384: - ldp @a[0], @a[1], [$a_ptr] - ldp @acc[0],@acc[1],[$b_ptr] - ldp @a[2], @a[3], [$a_ptr,#16] - subs @a[0],@a[0],@acc[0] - ldp @acc[2],@acc[3],[$b_ptr,#16] - sbcs @a[1],@a[1],@acc[1] - ldp @a[4], @a[5], [$a_ptr,#32] - sbcs @a[2],@a[2],@acc[2] - ldp @acc[4],@acc[5],[$b_ptr,#32] - sbcs @a[3],@a[3],@acc[3] - sbcs @a[4],@a[4],@acc[4] - sbcs @a[5],@a[5],@acc[5] - sbc $bi,xzr,xzr - - and @acc[0],@mod[0],$bi - and @acc[1],@mod[1],$bi - adds @a[0],@a[0],@acc[0] - and @acc[2],@mod[2],$bi - adcs @a[1],@a[1],@acc[1] - and @acc[3],@mod[3],$bi - adcs @a[2],@a[2],@acc[2] - and @acc[4],@mod[4],$bi - adcs @a[3],@a[3],@acc[3] - and @acc[5],@mod[5],$bi - adcs @a[4],@a[4],@acc[4] - stp @a[0],@a[1],[$r_ptr] - adc @a[5],@a[5],@acc[5] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ret 
-.size __sub_mod_384,.-__sub_mod_384 - -.globl mul_mont_384x -.hidden mul_mont_384x -.type mul_mont_384x,%function -.align 5 -mul_mont_384x: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#288 // space for 3 768-bit vectors - - mov @tmp[0],$r_ptr // save r_ptr - mov @tmp[1],$a_ptr // save b_ptr - mov @tmp[2],$b_ptr // save b_ptr - - sub $r_ptr,sp,#0 // mul_384(t0, a->re, b->re) - bl __mul_384 - - add $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im) - add $b_ptr,$b_ptr,#48 - add $r_ptr,sp,#96 - bl __mul_384 - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - sub $b_ptr,$a_ptr,#48 - add $r_ptr,sp,#240 - bl __add_mod_384 - - add $a_ptr,@tmp[2],#0 - add $b_ptr,@tmp[2],#48 - add $r_ptr,sp,#192 // t2 - bl __add_mod_384 - - add $a_ptr,$r_ptr,#0 - add $b_ptr,$r_ptr,#48 - bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - mov $a_ptr,$r_ptr - add $b_ptr,sp,#0 - bl __sub_mod_384x384 - - add $b_ptr,sp,#96 - bl __sub_mod_384x384 // t2 = t2-t0-t1 - - add $a_ptr,sp,#0 - add $b_ptr,sp,#96 - add $r_ptr,sp,#0 - bl __sub_mod_384x384 // t0 = t0-t1 - - add $a_ptr,sp,#0 // ret->re = redc(t0) - add $r_ptr,@tmp[0],#0 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - - add $a_ptr,sp,#192 // ret->im = redc(t2) - add $r_ptr,$r_ptr,#48 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#288 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_mont_384x,.-mul_mont_384x - -.globl sqr_mont_384x -.hidden sqr_mont_384x -.type sqr_mont_384x,%function -.align 5 -sqr_mont_384x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 2 384-bit vectors - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - add $b_ptr,$a_ptr,#48 - add $r_ptr,sp,#0 - bl __add_mod_384 // t0 = a->re + a->im - - add $r_ptr,sp,#48 - bl __sub_mod_384 // t1 = a->re - a->im - - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) - - adds @a[0],@a[0],@a[0] // add with itself - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc @acc[6],xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,@acc[6],xzr - - csel @acc[0],@a[0],@acc[0],lo - csel @acc[1],@a[1],@acc[1],lo - csel @acc[2],@a[2],@acc[2],lo - ldp @a[0],@a[1],[sp] - csel @acc[3],@a[3],@acc[3],lo - ldr $bi, [sp,#48] - csel @acc[4],@a[4],@acc[4],lo - ldp @a[2],@a[3],[sp,#16] - csel @acc[5],@a[5],@acc[5],lo - ldp @a[4],@a[5],[sp,#32] - - stp @acc[0],@acc[1],[$b_ptr,#48] - stp @acc[2],@acc[3],[$b_ptr,#64] - stp @acc[4],@acc[5],[$b_ptr,#80] - - add $b_ptr,sp,#48 - bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) - ldr x30,[x29,#8] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_mont_384x,.-sqr_mont_384x - -.globl mul_mont_384 -.hidden mul_mont_384 -.type mul_mont_384,%function -.align 5 -mul_mont_384: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_mont_384,.-mul_mont_384 - -.type __mul_mont_384,%function -.align 5 -__mul_mont_384: - mul @acc[0],@a[0],$bi - mul @acc[1],@a[1],$bi - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - mul @acc[4],@a[4],$bi - mul @acc[5],@a[5],$bi - mul $n0,$n0,@acc[0] - - umulh @tmp[0],@a[0],$bi - umulh @tmp[1],@a[1],$bi - umulh @tmp[2],@a[2],$bi - umulh @tmp[3],@a[3],$bi - umulh @tmp[4],@a[4],$bi - umulh @tmp[5],@a[5],$bi - - adds @acc[1],@acc[1],@tmp[0] - // mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 - adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adc @acc[6],xzr, @tmp[5] - mul @tmp[5],@mod[5],$n0 - mov $bi,xzr -___ -for ($i=1;$i<6;$i++) { -$code.=<<___; - subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adcs @acc[6],@acc[6],xzr - adc $n0,$bi,xzr - ldr $bi,[$b_ptr,8*$i] - - adds @acc[0],@acc[1],@tmp[0] - mul @tmp[0],@a[0],$bi - adcs @acc[1],@acc[2],@tmp[1] - mul @tmp[1],@a[1],$bi - adcs @acc[2],@acc[3],@tmp[2] - mul @tmp[2],@a[2],$bi - adcs @acc[3],@acc[4],@tmp[3] - mul @tmp[3],@a[3],$bi - adcs @acc[4],@acc[5],@tmp[4] - mul @tmp[4],@a[4],$bi - adcs @acc[5],@acc[6],@tmp[5] - mul @tmp[5],@a[5],$bi - adc @acc[6],$n0,xzr - ldr $n0,[x29,#96] - - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@a[0],$bi - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@a[1],$bi - adcs @acc[2],@acc[2],@tmp[2] - mul $n0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@a[3],$bi - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@a[4],$bi - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@a[5],$bi - adcs @acc[6],@acc[6],xzr - adc $bi,xzr,xzr - - adds @acc[1],@acc[1],@tmp[0] - // mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 - adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adcs @acc[6],@acc[6],@tmp[5] - mul @tmp[5],@mod[5],$n0 - adc $bi,$bi,xzr -___ -} -$code.=<<___; - subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adcs @acc[6],@acc[6],xzr - ldp 
$n0,$b_ptr,[x29,#96] // pull r_ptr - adc $bi,$bi,xzr - - adds @acc[0],@acc[1],@tmp[0] - adcs @acc[1],@acc[2],@tmp[1] - adcs @acc[2],@acc[3],@tmp[2] - adcs @acc[3],@acc[4],@tmp[3] - adcs @acc[4],@acc[5],@tmp[4] - adcs @acc[5],@acc[6],@tmp[5] - adc @acc[6],$bi,xzr - - subs @tmp[0],@acc[0],@mod[0] - sbcs @tmp[1],@acc[1],@mod[1] - sbcs @tmp[2],@acc[2],@mod[2] - sbcs @tmp[3],@acc[3],@mod[3] - sbcs @tmp[4],@acc[4],@mod[4] - sbcs @tmp[5],@acc[5],@mod[5] - sbcs xzr, @acc[6],xzr - - csel @a[0],@acc[0],@tmp[0],lo - csel @a[1],@acc[1],@tmp[1],lo - csel @a[2],@acc[2],@tmp[2],lo - csel @a[3],@acc[3],@tmp[3],lo - csel @a[4],@acc[4],@tmp[4],lo - csel @a[5],@acc[5],@tmp[5],lo - ret -.size __mul_mont_384,.-__mul_mont_384 - -.globl sqr_mont_384 -.hidden sqr_mont_384 -.type sqr_mont_384,%function -.align 5 -sqr_mont_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for 768-bit vector - mov $n0,$n_ptr // adjust for missing b_ptr - - mov $n_ptr,$r_ptr // save r_ptr - mov $r_ptr,sp - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - bl __sqr_384 - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - mov $a_ptr,sp - mov $r_ptr,$n_ptr // restore r_ptr - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_mont_384,.-sqr_mont_384 - -.globl sqr_n_mul_mont_383 -.hidden sqr_n_mul_mont_383 -.type sqr_n_mul_mont_383,%function -.align 5 -sqr_n_mul_mont_383: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 768-bit vector - mov $bi,x5 // save b_ptr - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - mov $r_ptr,sp -.Loop_sqr_383: - bl __sqr_384 - sub $b_ptr,$b_ptr,#1 // counter - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - mov $a_ptr,sp - bl __mul_by_1_mont_384 - - ldp @acc[0],@acc[1],[$a_ptr,#48] - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @acc[4],@acc[5],[$a_ptr,#80] - - adds @a[0],@a[0],@acc[0] // just accumulate upper half - adcs @a[1],@a[1],@acc[1] - adcs @a[2],@a[2],@acc[2] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adc @a[5],@a[5],@acc[5] - - cbnz $b_ptr,.Loop_sqr_383 - - mov $b_ptr,$bi - ldr $bi,[$bi] - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 -___ -{ -my @acc=(@acc,@tmp[0..2]); - -$code.=<<___; -.type __sqr_384,%function -.align 5 -__sqr_384: - mul @acc[0],@a[1],@a[0] - mul @acc[1],@a[2],@a[0] - mul @acc[2],@a[3],@a[0] - mul @acc[3],@a[4],@a[0] - mul @acc[4],@a[5],@a[0] - - umulh @mod[1],@a[1],@a[0] - umulh @mod[2],@a[2],@a[0] - umulh @mod[3],@a[3],@a[0] - umulh @mod[4],@a[4],@a[0] - adds @acc[1],@acc[1],@mod[1] - umulh @mod[5],@a[5],@a[0] - adcs @acc[2],@acc[2],@mod[2] - mul @mod[2],@a[2],@a[1] - adcs @acc[3],@acc[3],@mod[3] - mul @mod[3],@a[3],@a[1] - adcs @acc[4],@acc[4],@mod[4] - mul @mod[4],@a[4],@a[1] - adc @acc[5],xzr, @mod[5] - mul @mod[5],@a[5],@a[1] - - adds @acc[2],@acc[2],@mod[2] - umulh @mod[2],@a[2],@a[1] - adcs @acc[3],@acc[3],@mod[3] - umulh @mod[3],@a[3],@a[1] - adcs @acc[4],@acc[4],@mod[4] - umulh @mod[4],@a[4],@a[1] - adcs @acc[5],@acc[5],@mod[5] - umulh @mod[5],@a[5],@a[1] - adc @acc[6],xzr,xzr - - mul @mod[0],@a[0],@a[0] - adds @acc[3],@acc[3],@mod[2] - umulh @a[0], @a[0],@a[0] - adcs @acc[4],@acc[4],@mod[3] - mul @mod[3],@a[3],@a[2] - adcs @acc[5],@acc[5],@mod[4] - mul @mod[4],@a[4],@a[2] - adc @acc[6],@acc[6],@mod[5] - mul @mod[5],@a[5],@a[2] - - adds @acc[4],@acc[4],@mod[3] - umulh @mod[3],@a[3],@a[2] - adcs @acc[5],@acc[5],@mod[4] - umulh @mod[4],@a[4],@a[2] - adcs @acc[6],@acc[6],@mod[5] - umulh @mod[5],@a[5],@a[2] - adc @acc[7],xzr,xzr - - mul @mod[1],@a[1],@a[1] - adds @acc[5],@acc[5],@mod[3] - umulh @a[1], @a[1],@a[1] - adcs @acc[6],@acc[6],@mod[4] - mul @mod[4],@a[4],@a[3] - adc @acc[7],@acc[7],@mod[5] - mul @mod[5],@a[5],@a[3] - - adds @acc[6],@acc[6],@mod[4] - umulh @mod[4],@a[4],@a[3] - adcs @acc[7],@acc[7],@mod[5] - umulh @mod[5],@a[5],@a[3] - adc @acc[8],xzr,xzr - mul @mod[2],@a[2],@a[2] - adds @acc[7],@acc[7],@mod[4] - umulh @a[2], @a[2],@a[2] - adc @acc[8],@acc[8],@mod[5] - mul @mod[3],@a[3],@a[3] - - mul @mod[5],@a[5],@a[4] - umulh @a[3], @a[3],@a[3] - adds @acc[8],@acc[8],@mod[5] - umulh @mod[5],@a[5],@a[4] - mul @mod[4],@a[4],@a[4] - adc @acc[9],@mod[5],xzr - - adds @acc[0],@acc[0],@acc[0] - adcs @acc[1],@acc[1],@acc[1] - adcs @acc[2],@acc[2],@acc[2] - adcs @acc[3],@acc[3],@acc[3] - adcs @acc[4],@acc[4],@acc[4] - adcs @acc[5],@acc[5],@acc[5] - adcs @acc[6],@acc[6],@acc[6] - adcs @acc[7],@acc[7],@acc[7] - umulh @a[4], 
@a[4],@a[4] - adcs @acc[8],@acc[8],@acc[8] - mul @mod[5],@a[5],@a[5] - adcs @acc[9],@acc[9],@acc[9] - umulh @a[5], @a[5],@a[5] - adc $a_ptr,xzr,xzr - - adds @acc[0],@acc[0],@a[0] - adcs @acc[1],@acc[1],@mod[1] - adcs @acc[2],@acc[2],@a[1] - adcs @acc[3],@acc[3],@mod[2] - adcs @acc[4],@acc[4],@a[2] - adcs @acc[5],@acc[5],@mod[3] - adcs @acc[6],@acc[6],@a[3] - stp @mod[0],@acc[0],[$r_ptr] - adcs @acc[7],@acc[7],@mod[4] - stp @acc[1],@acc[2],[$r_ptr,#16] - adcs @acc[8],@acc[8],@a[4] - stp @acc[3],@acc[4],[$r_ptr,#32] - adcs @acc[9],@acc[9],@mod[5] - stp @acc[5],@acc[6],[$r_ptr,#48] - adc @a[5],@a[5],$a_ptr - stp @acc[7],@acc[8],[$r_ptr,#64] - stp @acc[9],@a[5],[$r_ptr,#80] - - ret -.size __sqr_384,.-__sqr_384 -___ -} -$code.=<<___; -.globl sqr_384 -.hidden sqr_384 -.type sqr_384,%function -.align 5 -sqr_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - bl __sqr_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_384,.-sqr_384 - -.globl redc_mont_384 -.hidden redc_mont_384 -.type redc_mont_384,%function -.align 5 -redc_mont_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size redc_mont_384,.-redc_mont_384 - -.globl from_mont_384 -.hidden from_mont_384 -.type from_mont_384,%function -.align 5 -from_mont_384: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - csel @a[4],@a[4],@acc[4],lo - csel @a[5],@a[5],@acc[5],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size from_mont_384,.-from_mont_384 - -.type __mul_by_1_mont_384,%function -.align 5 -__mul_by_1_mont_384: - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - mul @tmp[0],$n0,@a[0] - ldp @a[4],@a[5],[$a_ptr,#32] - - // mul @acc[0],@mod[0],@tmp[0] - mul @acc[1],@mod[1],@tmp[0] - mul @acc[2],@mod[2],@tmp[0] - mul @acc[3],@mod[3],@tmp[0] - mul @acc[4],@mod[4],@tmp[0] - mul @acc[5],@mod[5],@tmp[0] - subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] - umulh @a[0],@mod[0],@tmp[0] - adcs @acc[1],@acc[1],@a[1] - umulh @a[1],@mod[1],@tmp[0] - adcs @acc[2],@acc[2],@a[2] - umulh @a[2],@mod[2],@tmp[0] - adcs @acc[3],@acc[3],@a[3] - umulh @a[3],@mod[3],@tmp[0] - adcs @acc[4],@acc[4],@a[4] - umulh @a[4],@mod[4],@tmp[0] - adcs @acc[5],@acc[5],@a[5] - umulh @a[5],@mod[5],@tmp[0] - adc @acc[6],xzr,xzr -___ -for ($i=1;$i<6;$i++) { -$code.=<<___; - adds @a[0],@a[0],@acc[1] - adcs @a[1],@a[1],@acc[2] - adcs @a[2],@a[2],@acc[3] - mul @tmp[0],$n0,@a[0] - adcs @a[3],@a[3],@acc[4] - adcs @a[4],@a[4],@acc[5] - adc @a[5],@a[5],@acc[6] - - // mul @acc[0],@mod[0],@tmp[0] - mul @acc[1],@mod[1],@tmp[0] - mul @acc[2],@mod[2],@tmp[0] - mul @acc[3],@mod[3],@tmp[0] - mul @acc[4],@mod[4],@tmp[0] - mul @acc[5],@mod[5],@tmp[0] - subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] - umulh @a[0],@mod[0],@tmp[0] - adcs @acc[1],@acc[1],@a[1] - umulh @a[1],@mod[1],@tmp[0] - adcs @acc[2],@acc[2],@a[2] - umulh @a[2],@mod[2],@tmp[0] - adcs @acc[3],@acc[3],@a[3] - umulh @a[3],@mod[3],@tmp[0] - adcs @acc[4],@acc[4],@a[4] - umulh @a[4],@mod[4],@tmp[0] - adcs @acc[5],@acc[5],@a[5] - umulh @a[5],@mod[5],@tmp[0] - adc @acc[6],xzr,xzr -___ -} -$code.=<<___; - adds @a[0],@a[0],@acc[1] - adcs @a[1],@a[1],@acc[2] - adcs @a[2],@a[2],@acc[3] - adcs @a[3],@a[3],@acc[4] - adcs @a[4],@a[4],@acc[5] - adc @a[5],@a[5],@acc[6] - - ret -.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 - -.type __redc_tail_mont_384,%function -.align 5 -__redc_tail_mont_384: - ldp @acc[0],@acc[1],[$a_ptr,#48] - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @acc[4],@acc[5],[$a_ptr,#80] - - adds @a[0],@a[0],@acc[0] // accumulate upper half - adcs @a[1],@a[1],@acc[1] - adcs @a[2],@a[2],@acc[2] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adcs @a[5],@a[5],@acc[5] - adc @acc[6],xzr,xzr - - subs @acc[0],@a[0],@mod[0] - sbcs @acc[1],@a[1],@mod[1] - sbcs @acc[2],@a[2],@mod[2] - sbcs @acc[3],@a[3],@mod[3] - sbcs @acc[4],@a[4],@mod[4] - sbcs @acc[5],@a[5],@mod[5] - sbcs xzr,@acc[6],xzr - - csel @a[0],@a[0],@acc[0],lo - csel @a[1],@a[1],@acc[1],lo - csel @a[2],@a[2],@acc[2],lo - csel @a[3],@a[3],@acc[3],lo - csel @a[4],@a[4],@acc[4],lo - csel 
@a[5],@a[5],@acc[5],lo - - stp @a[0],@a[1],[$r_ptr] - stp @a[2],@a[3],[$r_ptr,#16] - stp @a[4],@a[5],[$r_ptr,#32] - - ret -.size __redc_tail_mont_384,.-__redc_tail_mont_384 - -.globl mul_384 -.hidden mul_384 -.type mul_384,%function -.align 5 -mul_384: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - bl __mul_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_384,.-mul_384 - -.type __mul_384,%function -.align 5 -__mul_384: - ldp @a[0],@a[1],[$a_ptr] - ldr $bi, [$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - mul @acc[0],@a[0],$bi - mul @acc[1],@a[1],$bi - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - mul @acc[4],@a[4],$bi - mul @acc[5],@a[5],$bi - - umulh @mod[0],@a[0],$bi - umulh @mod[1],@a[1],$bi - umulh @mod[2],@a[2],$bi - umulh @mod[3],@a[3],$bi - umulh @mod[4],@a[4],$bi - umulh @mod[5],@a[5],$bi - ldr $bi,[$b_ptr,8*1] - - str @acc[0],[$r_ptr] - adds @acc[0],@acc[1],@mod[0] - mul @mod[0],@a[0],$bi - adcs @acc[1],@acc[2],@mod[1] - mul @mod[1],@a[1],$bi - adcs @acc[2],@acc[3],@mod[2] - mul @mod[2],@a[2],$bi - adcs @acc[3],@acc[4],@mod[3] - mul @mod[3],@a[3],$bi - adcs @acc[4],@acc[5],@mod[4] - mul @mod[4],@a[4],$bi - adc @acc[5],xzr, @mod[5] - mul @mod[5],@a[5],$bi -___ -for ($i=1;$i<5;$i++) { -$code.=<<___; - adds @acc[0],@acc[0],@mod[0] - umulh @mod[0],@a[0],$bi - adcs @acc[1],@acc[1],@mod[1] - umulh @mod[1],@a[1],$bi - adcs @acc[2],@acc[2],@mod[2] - umulh @mod[2],@a[2],$bi - adcs @acc[3],@acc[3],@mod[3] - umulh @mod[3],@a[3],$bi - adcs @acc[4],@acc[4],@mod[4] - umulh @mod[4],@a[4],$bi - adcs @acc[5],@acc[5],@mod[5] - umulh @mod[5],@a[5],$bi - ldr $bi,[$b_ptr,#8*($i+1)] - adc @acc[6],xzr,xzr - - str @acc[0],[$r_ptr,8*$i] - adds @acc[0],@acc[1],@mod[0] - mul @mod[0],@a[0],$bi - adcs @acc[1],@acc[2],@mod[1] - mul @mod[1],@a[1],$bi - adcs @acc[2],@acc[3],@mod[2] - mul @mod[2],@a[2],$bi - adcs @acc[3],@acc[4],@mod[3] - mul @mod[3],@a[3],$bi - adcs @acc[4],@acc[5],@mod[4] - mul @mod[4],@a[4],$bi - adc @acc[5],@acc[6],@mod[5] - mul @mod[5],@a[5],$bi -___ -} -$code.=<<___; - adds @acc[0],@acc[0],@mod[0] - umulh @mod[0],@a[0],$bi - adcs @acc[1],@acc[1],@mod[1] - umulh @mod[1],@a[1],$bi - adcs @acc[2],@acc[2],@mod[2] - umulh @mod[2],@a[2],$bi - adcs @acc[3],@acc[3],@mod[3] - umulh @mod[3],@a[3],$bi - adcs @acc[4],@acc[4],@mod[4] - umulh @mod[4],@a[4],$bi - adcs @acc[5],@acc[5],@mod[5] - umulh @mod[5],@a[5],$bi - adc @acc[6],xzr,xzr - - str @acc[0],[$r_ptr,8*$i] - adds @acc[0],@acc[1],@mod[0] - adcs @acc[1],@acc[2],@mod[1] - adcs @acc[2],@acc[3],@mod[2] - adcs @acc[3],@acc[4],@mod[3] - adcs @acc[4],@acc[5],@mod[4] - adc @acc[5],@acc[6],@mod[5] - - stp @acc[0],@acc[1],[$r_ptr,#48] - stp @acc[2],@acc[3],[$r_ptr,#64] - stp @acc[4],@acc[5],[$r_ptr,#80] - - ret -.size __mul_384,.-__mul_384 - -.globl mul_382x -.hidden mul_382x -.type mul_382x,%function -.align 5 -mul_382x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for two 384-bit vectors - - ldp @a[0],@a[1],[$a_ptr] - mov @tmp[0],$r_ptr // save r_ptr - ldp @acc[0],@acc[1],[$a_ptr,#48] - mov @tmp[1],$a_ptr // save a_ptr - ldp @a[2],@a[3],[$a_ptr,#16] - mov @tmp[2],$b_ptr // save b_ptr - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @a[4],@a[5],[$a_ptr,#32] - adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im - ldp @acc[4],@acc[5],[$a_ptr,#80] - adcs @mod[1],$a[1],@acc[1] - ldp @a[0],@a[1],[$b_ptr] - adcs @mod[2],$a[2],@acc[2] - ldp @acc[0],@acc[1],[$b_ptr,#48] - adcs @mod[3],$a[3],@acc[3] - ldp @a[2],@a[3],[$b_ptr,#16] - adcs @mod[4],$a[4],@acc[4] - ldp @acc[2],@acc[3],[$b_ptr,#64] - adc @mod[5],$a[5],@acc[5] - ldp @a[4],@a[5],[$b_ptr,#32] - - stp @mod[0],@mod[1],[sp] - adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im - ldp @acc[4],@acc[5],[$b_ptr,#80] - adcs @mod[1],$a[1],@acc[1] - stp @mod[2],@mod[3],[sp,#16] - adcs @mod[2],$a[2],@acc[2] - adcs @mod[3],$a[3],@acc[3] - stp @mod[4],@mod[5],[sp,#32] - adcs @mod[4],$a[4],@acc[4] - stp @mod[0],@mod[1],[sp,#48] - adc @mod[5],$a[5],@acc[5] - stp @mod[2],@mod[3],[sp,#64] - stp @mod[4],@mod[5],[sp,#80] - - bl __mul_384 // mul_384(ret->re, a->re, b->re) - - add $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) - add $b_ptr,sp,#48 - add $r_ptr,@tmp[0],#96 - bl __mul_384 - - add $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) - add $b_ptr,@tmp[2],#48 - add $r_ptr,sp,#0 - bl __mul_384 - - ldp @mod[0],@mod[1],[$n_ptr] - ldp @mod[2],@mod[3],[$n_ptr,#16] - ldp @mod[4],@mod[5],[$n_ptr,#32] - - add $a_ptr,@tmp[0],#96 // ret->im -= tx - add $b_ptr,sp,#0 - add $r_ptr,@tmp[0],#96 - bl __sub_mod_384x384 - - add $b_ptr,@tmp[0],#0 // ret->im -= ret->re - bl __sub_mod_384x384 - - add $a_ptr,@tmp[0],#0 // ret->re -= tx - add $b_ptr,sp,#0 - add $r_ptr,@tmp[0],#0 - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size mul_382x,.-mul_382x - -.globl sqr_382x -.hidden sqr_382x -.type sqr_382x,%function -.align 5 -sqr_382x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp @a[0],@a[1],[$a_ptr] - ldp @acc[0],@acc[1],[$a_ptr,#48] - ldp @a[2],@a[3],[$a_ptr,#16] - adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im - ldp @acc[2],@acc[3],[$a_ptr,#64] - adcs @mod[1],$a[1],@acc[1] - ldp @a[4],@a[5],[$a_ptr,#32] - adcs @mod[2],$a[2],@acc[2] - ldp @acc[4],@acc[5],[$a_ptr,#80] - adcs @mod[3],$a[3],@acc[3] - stp @mod[0],@mod[1],[$r_ptr] - adcs @mod[4],$a[4],@acc[4] - ldp @mod[0],@mod[1],[$b_ptr] - adc @mod[5],$a[5],@acc[5] - stp @mod[2],@mod[3],[$r_ptr,#16] - - subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im - ldp @mod[2],@mod[3],[$b_ptr,#16] - sbcs @a[1],$a[1],@acc[1] - stp @mod[4],@mod[5],[$r_ptr,#32] - sbcs @a[2],$a[2],@acc[2] - ldp @mod[4],@mod[5],[$b_ptr,#32] - sbcs @a[3],$a[3],@acc[3] - sbcs @a[4],$a[4],@acc[4] - sbcs @a[5],$a[5],@acc[5] - sbc @acc[6],xzr,xzr - - and @acc[0],@mod[0],@acc[6] - and @acc[1],@mod[1],@acc[6] - adds @a[0],@a[0],@acc[0] - and @acc[2],@mod[2],@acc[6] - adcs @a[1],@a[1],@acc[1] - and @acc[3],@mod[3],@acc[6] - adcs @a[2],@a[2],@acc[2] - and @acc[4],@mod[4],@acc[6] - adcs @a[3],@a[3],@acc[3] - and @acc[5],@mod[5],@acc[6] - adcs @a[4],@a[4],@acc[4] - stp @a[0],@a[1],[$r_ptr,#48] - adc @a[5],@a[5],@acc[5] - stp @a[2],@a[3],[$r_ptr,#64] - stp @a[4],@a[5],[$r_ptr,#80] - - mov $n0,$a_ptr // save a_ptr - add $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) - add $b_ptr,$r_ptr,#48 - bl __mul_384 - - add $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) - add $b_ptr,$n0,#48 - add $r_ptr,$r_ptr,#96 - bl __mul_384 - ldr x30,[x29,#8] - - ldp @a[0],@a[1],[$r_ptr] - ldp @a[2],@a[3],[$r_ptr,#16] - adds @a[0],@a[0],@a[0] // add with itself - ldp @a[4],@a[5],[$r_ptr,#32] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adcs @acc[0],@acc[0],@acc[0] - adcs @acc[1],@acc[1],@acc[1] - stp @a[0],@a[1],[$r_ptr] - adcs @acc[2],@acc[2],@acc[2] - stp @a[2],@a[3],[$r_ptr,#16] - adcs @acc[3],@acc[3],@acc[3] - stp @a[4],@a[5],[$r_ptr,#32] - adcs @acc[4],@acc[4],@acc[4] - stp @acc[0],@acc[1],[$r_ptr,#48] - adc @acc[5],@acc[5],@acc[5] - stp @acc[2],@acc[3],[$r_ptr,#64] - stp @acc[4],@acc[5],[$r_ptr,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_382x,.-sqr_382x - -.globl sqr_mont_382x -.hidden sqr_mont_382x -.type sqr_mont_382x,%function -.align 5 -sqr_mont_382x: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#112 // space for two 384-bit vectors + word - mov $n0,$n_ptr // adjust for missing b_ptr - - ldp @a[0],@a[1],[$a_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - - ldp $bi,@acc[1],[$a_ptr,#48] - ldp @acc[2],@acc[3],[$a_ptr,#64] - ldp @acc[4],@acc[5],[$a_ptr,#80] - - adds @mod[0],$a[0],$bi // t0 = a->re + a->im - adcs @mod[1],$a[1],@acc[1] - adcs @mod[2],$a[2],@acc[2] - adcs @mod[3],$a[3],@acc[3] - adcs @mod[4],$a[4],@acc[4] - adc @mod[5],$a[5],@acc[5] - - subs @acc[0],$a[0],$bi // t1 = a->re - a->im - sbcs @acc[1],$a[1],@acc[1] - sbcs @acc[2],$a[2],@acc[2] - sbcs @acc[3],$a[3],@acc[3] - sbcs @acc[4],$a[4],@acc[4] - sbcs @acc[5],$a[5],@acc[5] - sbc @acc[6],xzr,xzr // borrow flag as mask - - stp @mod[0],@mod[1],[sp] - stp @mod[2],@mod[3],[sp,#16] - stp @mod[4],@mod[5],[sp,#32] - stp @acc[0],@acc[1],[sp,#48] - stp @acc[2],@acc[3],[sp,#64] - stp @acc[4],@acc[5],[sp,#80] - str @acc[6],[sp,#96] - - ldp @mod[0],@mod[1],[$b_ptr] - ldp @mod[2],@mod[3],[$b_ptr,#16] - ldp @mod[4],@mod[5],[$b_ptr,#32] - - add $b_ptr,$a_ptr,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) - - adds @acc[0],@a[0],@a[0] // add with itself - adcs @acc[1],@a[1],@a[1] - adcs @acc[2],@a[2],@a[2] - adcs @acc[3],@a[3],@a[3] - adcs @acc[4],@a[4],@a[4] - adc @acc[5],@a[5],@a[5] - - stp @acc[0],@acc[1],[$b_ptr,#48] - stp @acc[2],@acc[3],[$b_ptr,#64] - stp @acc[4],@acc[5],[$b_ptr,#80] - - ldp @a[0],@a[1],[sp] - ldr $bi,[sp,#48] - ldp @a[2],@a[3],[sp,#16] - ldp @a[4],@a[5],[sp,#32] - - add $b_ptr,sp,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) - ldr x30,[x29,#8] - - ldr @acc[6],[sp,#96] // account for sign from a->re - a->im - ldp @acc[0],@acc[1],[sp] - ldp @acc[2],@acc[3],[sp,#16] - ldp @acc[4],@acc[5],[sp,#32] - - and @acc[0],@acc[0],@acc[6] - and @acc[1],@acc[1],@acc[6] - and @acc[2],@acc[2],@acc[6] - and @acc[3],@acc[3],@acc[6] - and @acc[4],@acc[4],@acc[6] - and @acc[5],@acc[5],@acc[6] - - subs @a[0],@a[0],@acc[0] - sbcs @a[1],@a[1],@acc[1] - sbcs @a[2],@a[2],@acc[2] - sbcs @a[3],@a[3],@acc[3] - sbcs @a[4],@a[4],@acc[4] - sbcs @a[5],@a[5],@acc[5] - sbc @acc[6],xzr,xzr - - and @acc[0],@mod[0],@acc[6] - and @acc[1],@mod[1],@acc[6] - and @acc[2],@mod[2],@acc[6] - and @acc[3],@mod[3],@acc[6] - and @acc[4],@mod[4],@acc[6] - and @acc[5],@mod[5],@acc[6] - - adds @a[0],@a[0],@acc[0] - adcs @a[1],@a[1],@acc[1] - adcs @a[2],@a[2],@acc[2] - adcs @a[3],@a[3],@acc[3] - adcs @a[4],@a[4],@acc[4] - adc @a[5],@a[5],@acc[5] - - stp @a[0],@a[1],[$b_ptr] - stp @a[2],@a[3],[$b_ptr,#16] - stp @a[4],@a[5],[$b_ptr,#32] - - add sp,sp,#112 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sqr_mont_382x,.-sqr_mont_382x - -.type __mul_mont_383_nonred,%function -.align 5 -__mul_mont_383_nonred: - mul @acc[0],@a[0],$bi - mul @acc[1],@a[1],$bi - mul @acc[2],@a[2],$bi - mul @acc[3],@a[3],$bi - mul @acc[4],@a[4],$bi - mul @acc[5],@a[5],$bi - mul $n0,$n0,@acc[0] - - umulh @tmp[0],@a[0],$bi - umulh @tmp[1],@a[1],$bi - umulh @tmp[2],@a[2],$bi - umulh @tmp[3],@a[3],$bi - umulh @tmp[4],@a[4],$bi - umulh @tmp[5],@a[5],$bi - - adds @acc[1],@acc[1],@tmp[0] - mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 
- adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adc @acc[6],xzr, @tmp[5] - mul @tmp[5],@mod[5],$n0 -___ -for ($i=1;$i<6;$i++) { -$code.=<<___; - ldr $bi,[$b_ptr,8*$i] - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adc @acc[6],@acc[6],xzr - - ldr $n0,[x29,#96] - adds @acc[0],@acc[1],@tmp[0] - mul @tmp[0],@a[0],$bi - adcs @acc[1],@acc[2],@tmp[1] - mul @tmp[1],@a[1],$bi - adcs @acc[2],@acc[3],@tmp[2] - mul @tmp[2],@a[2],$bi - adcs @acc[3],@acc[4],@tmp[3] - mul @tmp[3],@a[3],$bi - adcs @acc[4],@acc[5],@tmp[4] - mul @tmp[4],@a[4],$bi - adcs @acc[5],@acc[6],@tmp[5] - mul @tmp[5],@a[5],$bi - adc @acc[6],xzr,xzr - - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@a[0],$bi - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@a[1],$bi - adcs @acc[2],@acc[2],@tmp[2] - mul $n0,$n0,@acc[0] - umulh @tmp[2],@a[2],$bi - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@a[3],$bi - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@a[4],$bi - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@a[5],$bi - adc @acc[6],@acc[6],xzr - - adds @acc[1],@acc[1],@tmp[0] - mul @tmp[0],@mod[0],$n0 - adcs @acc[2],@acc[2],@tmp[1] - mul @tmp[1],@mod[1],$n0 - adcs @acc[3],@acc[3],@tmp[2] - mul @tmp[2],@mod[2],$n0 - adcs @acc[4],@acc[4],@tmp[3] - mul @tmp[3],@mod[3],$n0 - adcs @acc[5],@acc[5],@tmp[4] - mul @tmp[4],@mod[4],$n0 - adc @acc[6],@acc[6],@tmp[5] - mul @tmp[5],@mod[5],$n0 -___ -} -$code.=<<___; - adds @acc[0],@acc[0],@tmp[0] - umulh @tmp[0],@mod[0],$n0 - adcs @acc[1],@acc[1],@tmp[1] - umulh @tmp[1],@mod[1],$n0 - adcs @acc[2],@acc[2],@tmp[2] - umulh @tmp[2],@mod[2],$n0 - adcs @acc[3],@acc[3],@tmp[3] - umulh @tmp[3],@mod[3],$n0 - adcs @acc[4],@acc[4],@tmp[4] - umulh @tmp[4],@mod[4],$n0 - adcs @acc[5],@acc[5],@tmp[5] - umulh @tmp[5],@mod[5],$n0 - adc @acc[6],@acc[6],xzr - ldp $n0,$b_ptr,[x29,#96] // pull r_ptr - - adds @a[0],@acc[1],@tmp[0] - adcs @a[1],@acc[2],@tmp[1] - adcs @a[2],@acc[3],@tmp[2] - adcs @a[3],@acc[4],@tmp[3] - adcs @a[4],@acc[5],@tmp[4] - adcs @a[5],@acc[6],@tmp[5] - - ret -.size __mul_mont_383_nonred,.-__mul_mont_383_nonred - -.globl sgn0_pty_mont_384 -.hidden sgn0_pty_mont_384 -.type sgn0_pty_mont_384,%function -.align 5 -sgn0_pty_mont_384: - paciasp - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov $n0,$b_ptr - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - mov $a_ptr,$r_ptr - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and $r_ptr,@a[0],#1 - adds @a[0],@a[0],@a[0] - adcs @a[1],@a[1],@a[1] - adcs @a[2],@a[2],@a[2] - adcs @a[3],@a[3],@a[3] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $bi,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $bi,$bi,xzr - - mvn $bi,$bi - and $bi,$bi,#2 - orr $r_ptr,$r_ptr,$bi - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 - -.globl sgn0_pty_mont_384x -.hidden sgn0_pty_mont_384x -.type sgn0_pty_mont_384x,%function -.align 5 -sgn0_pty_mont_384x: - paciasp - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov $n0,$b_ptr - ldp @mod[0],@mod[1],[$a_ptr] - ldp @mod[2],@mod[3],[$a_ptr,#16] - ldp @mod[4],@mod[5],[$a_ptr,#32] - mov $a_ptr,$r_ptr - - bl __mul_by_1_mont_384 - add $a_ptr,$a_ptr,#48 - - and $b_ptr,@a[0],#1 - orr $n_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $n_ptr,$n_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $n_ptr,$n_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr $n_ptr,$n_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $n_ptr,$n_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $bi,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $bi,$bi,xzr - - mvn $bi,$bi - and $bi,$bi,#2 - orr $b_ptr,$b_ptr,$bi - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and $r_ptr,@a[0],#1 - orr $a_ptr,@a[0],@a[1] - adds @a[0],@a[0],@a[0] - orr $a_ptr,$a_ptr,@a[2] - adcs @a[1],@a[1],@a[1] - orr $a_ptr,$a_ptr,@a[3] - adcs @a[2],@a[2],@a[2] - orr $a_ptr,$a_ptr,@a[4] - adcs @a[3],@a[3],@a[3] - orr $a_ptr,$a_ptr,@a[5] - adcs @a[4],@a[4],@a[4] - adcs @a[5],@a[5],@a[5] - adc $bi,xzr,xzr - - subs @a[0],@a[0],@mod[0] - sbcs @a[1],@a[1],@mod[1] - sbcs @a[2],@a[2],@mod[2] - sbcs @a[3],@a[3],@mod[3] - sbcs @a[4],@a[4],@mod[4] - sbcs @a[5],@a[5],@mod[5] - sbc $bi,$bi,xzr - - mvn $bi,$bi - and $bi,$bi,#2 - orr $r_ptr,$r_ptr,$bi - - cmp $n_ptr,#0 - csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp $a_ptr,#0 - csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and $n_ptr,$n_ptr,#1 - and $a_ptr,$a_ptr,#2 - orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - autiasp - ret -.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x -___ - -if (0) { -my @b = ($bi, @mod[0..4]); -my @comba = @acc[4..6]; - -$code.=<<___; -.type __mul_384_comba,%function -.align 5 -__mul_384_comba: - ldp @a[0],@a[1],[$a_ptr] - ldp @b[0],@b[1],[$b_ptr] - ldp @a[2],@a[3],[$a_ptr,#16] - ldp @a[4],@a[5],[$a_ptr,#32] - ldp @b[2],@b[3],[$b_ptr,#16] - ldp @b[4],@b[5],[$b_ptr,#32] - - mul @comba[0],@a[0],@b[0] - umulh @comba[1],@a[0],@b[0] - mul @acc[0],@a[1],@b[0] - umulh @acc[1],@a[1],@b[0] - str @comba[0],[$r_ptr] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[0],@b[1] - umulh @acc[3],@a[0],@b[1] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],xzr, @acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[2],@b[0] - umulh @acc[1],@a[2],@b[0] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#8] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[1],@b[1] - umulh @acc[3],@a[1],@b[1] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[0],@b[2] - umulh @acc[1],@a[0],@b[2] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[3],@b[0] - umulh @acc[3],@a[3],@b[0] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#16] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[2],@b[1] - umulh @acc[1],@a[2],@b[1] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[1],@b[2] - umulh @acc[3],@a[1],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[0],@b[3] - umulh @acc[1],@a[0],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[4],@b[0] - umulh @acc[3],@a[4],@b[0] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#24] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[3],@b[1] - umulh @acc[1],@a[3],@b[1] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[2],@b[2] - umulh @acc[3],@a[2],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[1],@b[3] - umulh @acc[1],@a[1],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[0],@b[4] - umulh @acc[3],@a[0],@b[4] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[5],@b[0] - umulh @acc[1],@a[5],@b[0] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#32] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[4],@b[1] - umulh @acc[3],@a[4],@b[1] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[3],@b[2] - umulh @acc[1],@a[3],@b[2] - adds @comba[0],@comba[0],@acc[2] - adcs 
@comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[2],@b[3] - umulh @acc[3],@a[2],@b[3] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[1],@b[4] - umulh @acc[1],@a[1],@b[4] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[0],@b[5] - umulh @acc[3],@a[0],@b[5] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[5],@b[1] - umulh @acc[1],@a[5],@b[1] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#40] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[4],@b[2] - umulh @acc[3],@a[4],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[3],@b[3] - umulh @acc[1],@a[3],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[2],@b[4] - umulh @acc[3],@a[2],@b[4] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[1],@b[5] - umulh @acc[1],@a[1],@b[5] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[5],@b[2] - umulh @acc[3],@a[5],@b[2] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#48] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[4],@b[3] - umulh @acc[1],@a[4],@b[3] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[3],@b[4] - umulh @acc[3],@a[3],@b[4] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[2],@b[5] - umulh @acc[1],@a[2],@b[5] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - mul @acc[2],@a[5],@b[3] - umulh @acc[3],@a[5],@b[3] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#56] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[0],@a[4],@b[4] - umulh @acc[1],@a[4],@b[4] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],xzr,xzr - mul @acc[2],@a[3],@b[5] - umulh @acc[3],@a[3],@b[5] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],@comba[2],xzr - mul @acc[0],@a[5],@b[4] - umulh @acc[1],@a[5],@b[4] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#64] -___ - push(@comba,shift(@comba)); -$code.=<<___; - mul @acc[2],@a[4],@b[5] - umulh @acc[3],@a[4],@b[5] - adds @comba[0],@comba[0],@acc[0] - adcs @comba[1],@comba[1],@acc[1] - adc @comba[2],xzr,xzr - mul @acc[0],@a[5],@b[5] - umulh @acc[1],@a[5],@b[5] - adds @comba[0],@comba[0],@acc[2] - adcs @comba[1],@comba[1],@acc[3] - adc @comba[2],@comba[2],xzr - str @comba[0],[$r_ptr,#72] -___ - push(@comba,shift(@comba)); -$code.=<<___; - adds @comba[0],@comba[0],@acc[0] - adc @comba[1],@comba[1],@acc[1] - stp @comba[0],@comba[1],[$r_ptr,#80] - - ret -.size __mul_384_comba,.-__mul_384_comba -___ -} -print $code; - -close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_256-x86_64.pl b/crypto/blst_src/asm/mulq_mont_256-x86_64.pl deleted file mode 100755 index 
12e58bb001e..00000000000 --- a/crypto/blst_src/asm/mulq_mont_256-x86_64.pl +++ /dev/null @@ -1,513 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# As for "sparse" in subroutine names, see commentary in the -# asm/mulx_mont_256-x86_64.pl module. - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -{ ############################################################## 256 bits -my @acc=map("%r$_",(9..15)); - -{ ############################################################## mulq -my ($hi, $a0) = ("%rbp", $r_ptr); - -$code.=<<___; -.text - -.globl mul_mont_sparse_256 -.hidden mul_mont_sparse_256 -.type mul_mont_sparse_256,\@function,5,"unwind" -.align 32 -mul_mont_sparse_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($b_org), %rax - mov 8*0($a_ptr), @acc[4] - mov 8*1($a_ptr), @acc[5] - mov 8*2($a_ptr), @acc[3] - mov 8*3($a_ptr), $hi - mov $b_org, $b_ptr # evacuate from %rdx - - mov %rax, @acc[6] - mulq @acc[4] # a[0]*b[0] - mov %rax, @acc[0] - mov @acc[6], %rax - mov %rdx, @acc[1] - call __mulq_mont_sparse_256 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size mul_mont_sparse_256,.-mul_mont_sparse_256 - -.globl sqr_mont_sparse_256 -.hidden sqr_mont_sparse_256 -.type sqr_mont_sparse_256,\@function,4,"unwind" -.align 32 -sqr_mont_sparse_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov 8*0($a_ptr), %rax - mov $n_ptr, $n0 - mov 8*1($a_ptr), @acc[5] - mov $b_org, $n_ptr - mov 8*2($a_ptr), @acc[3] - lea ($a_ptr), $b_ptr - mov 8*3($a_ptr), $hi - - mov %rax, @acc[6] - mulq %rax # a[0]*a[0] - mov %rax, @acc[0] - mov @acc[6], %rax - mov %rdx, @acc[1] - call __mulq_mont_sparse_256 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 -___ -{ -my @acc=@acc; -$code.=<<___; -.type __mulq_mont_sparse_256,\@abi-omnipotent -.align 32 -__mulq_mont_sparse_256: - mulq @acc[5] # a[1]*b[0] - add %rax, @acc[1] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[2] - 
- mulq @acc[3] # a[2]*b[0] - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mulq $hi # a[3]*b[0] - add %rax, @acc[3] - mov 8($b_ptr), %rax - adc \$0, %rdx - xor @acc[5], @acc[5] - mov %rdx, @acc[4] - -___ -for (my $i=1; $i<4; $i++) { -my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - mov @acc[0], $a0 - imulq $n0, @acc[0] - - ################################# Multiply by b[$i] - mov %rax, @acc[6] - mulq 8*0($a_ptr) - add %rax, @acc[1] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*1($a_ptr) - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($a_ptr) - add %rax, @acc[3] - mov @acc[6], %rax - adc \$0, %rdx - add $hi, @acc[3] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($a_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc %rdx, @acc[5] # can't overflow - xor @acc[6], @acc[6] - - ################################# reduction - mulq 8*0($n_ptr) - add %rax, $a0 # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, $a0 - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add $a0, @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add %rax, @acc[3] - mov $b_next, %rax - adc \$0, %rdx - add $hi, @acc[3] - adc \$0, %rdx - add %rdx, @acc[4] - adc \$0, @acc[5] - adc \$0, @acc[6] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - imulq $n0, %rax - mov 8(%rsp), $a_ptr # restore $r_ptr - - ################################# last reduction - mov %rax, @acc[6] - mulq 8*0($n_ptr) - add %rax, @acc[0] # guaranteed to be zero - mov @acc[6], %rax - adc %rdx, @acc[0] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[6], %rax - adc \$0, %rdx - add @acc[0], @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - mov @acc[2], $b_ptr - add $hi, @acc[3] - adc \$0, %rdx - add %rax, @acc[3] - mov @acc[1], %rax - adc \$0, %rdx - add %rdx, @acc[4] - adc \$0, @acc[5] - - ################################# - # Branch-less conditional subtraction of modulus - - mov @acc[3], @acc[0] - sub 8*0($n_ptr), @acc[1] - sbb 8*1($n_ptr), @acc[2] - sbb 8*2($n_ptr), @acc[3] - mov @acc[4], $hi - sbb 8*3($n_ptr), @acc[4] - sbb \$0, @acc[5] - - cmovc %rax, @acc[1] - cmovc $b_ptr, @acc[2] - cmovc @acc[0], @acc[3] - mov @acc[1], 8*0($a_ptr) - cmovc $hi, @acc[4] - mov @acc[2], 8*1($a_ptr) - mov @acc[3], 8*2($a_ptr) - mov @acc[4], 8*3($a_ptr) - - ret -.cfi_endproc -.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 -___ -} } -{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" - -$code.=<<___; -.globl from_mont_256 -.hidden from_mont_256 -.type from_mont_256,\@function,4,"unwind" -.align 32 -from_mont_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_256 - - ################################# - # Branch-less conditional acc[0:3] - modulus - - #mov @acc[4], %rax # __mulq_by_1_mont_256 does it - mov @acc[5], @acc[1] - mov @acc[6], @acc[2] - mov @acc[0], @acc[3] - - sub 8*0($n_ptr), @acc[4] - sbb 8*1($n_ptr), @acc[5] - sbb 
8*2($n_ptr), @acc[6] - sbb 8*3($n_ptr), @acc[0] - - cmovnc @acc[4], %rax - cmovnc @acc[5], @acc[1] - cmovnc @acc[6], @acc[2] - mov %rax, 8*0($r_ptr) - cmovnc @acc[0], @acc[3] - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size from_mont_256,.-from_mont_256 - -.globl redc_mont_256 -.hidden redc_mont_256 -.type redc_mont_256,\@function,4,"unwind" -.align 32 -redc_mont_256: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_256 - - add 8*4($a_ptr), @acc[4] # accumulate upper half - adc 8*5($a_ptr), @acc[5] - mov @acc[4], %rax - adc 8*6($a_ptr), @acc[6] - mov @acc[5], @acc[1] - adc 8*7($a_ptr), @acc[0] - sbb $a_ptr, $a_ptr - - ################################# - # Branch-less conditional acc[0:4] - modulus - - mov @acc[6], @acc[2] - sub 8*0($n_ptr), @acc[4] - sbb 8*1($n_ptr), @acc[5] - sbb 8*2($n_ptr), @acc[6] - mov @acc[0], @acc[3] - sbb 8*3($n_ptr), @acc[0] - sbb \$0, $a_ptr - - cmovnc @acc[4], %rax - cmovnc @acc[5], @acc[1] - cmovnc @acc[6], @acc[2] - mov %rax, 8*0($r_ptr) - cmovnc @acc[0], @acc[3] - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size redc_mont_256,.-redc_mont_256 -___ -{ -my @acc=@acc; - -$code.=<<___; -.type __mulq_by_1_mont_256,\@abi-omnipotent -.align 32 -__mulq_by_1_mont_256: - mov 8*0($a_ptr), %rax - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - - mov %rax, @acc[4] - imulq $n0, %rax - mov %rax, @acc[0] -___ -for (my $i=0; $i<4; $i++) { -my $hi = @acc[4]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, @acc[4] # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, @acc[4] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[4], @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) -___ -$code.=<<___ if ($i<3); - mov @acc[1], @acc[5] - imulq $n0, @acc[1] -___ -$code.=<<___; - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add %rax, @acc[3] - mov @acc[1], %rax - adc \$0, %rdx - add $hi, @acc[3] - adc \$0, %rdx - mov %rdx, @acc[4] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - ret -.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 -___ -} } } - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/mulq_mont_384-x86_64.pl b/crypto/blst_src/asm/mulq_mont_384-x86_64.pl deleted file mode 100755 index 3812319e8ba..00000000000 --- a/crypto/blst_src/asm/mulq_mont_384-x86_64.pl +++ /dev/null @@ -1,2675 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for 
details. -# SPDX-License-Identifier: Apache-2.0 - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -# common argument layout -($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); -$b_ptr = "%rbx"; - -# common accumulator layout -@acc=map("%r$_",(8..15)); - -######################################################################## -{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.text - -######################################################################## -# Double-width subtraction modulo n<<384, as opposite to naively -# expected modulo n*n. It works because n<<384 is the actual -# input boundary condition for Montgomery reduction, not n*n. -# Just in case, this is duplicated, but only one module is -# supposed to be linked... -.type __sub_mod_384x384,\@abi-omnipotent -.align 32 -__sub_mod_384x384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - mov 8*6($a_ptr), @acc[6] - - sub 8*0($b_org), @acc[0] - mov 8*7($a_ptr), @acc[7] - sbb 8*1($b_org), @acc[1] - mov 8*8($a_ptr), @acc[8] - sbb 8*2($b_org), @acc[2] - mov 8*9($a_ptr), @acc[9] - sbb 8*3($b_org), @acc[3] - mov 8*10($a_ptr), @acc[10] - sbb 8*4($b_org), @acc[4] - mov 8*11($a_ptr), @acc[11] - sbb 8*5($b_org), @acc[5] - mov @acc[0], 8*0($r_ptr) - sbb 8*6($b_org), @acc[6] - mov 8*0($n_ptr), @acc[0] - mov @acc[1], 8*1($r_ptr) - sbb 8*7($b_org), @acc[7] - mov 8*1($n_ptr), @acc[1] - mov @acc[2], 8*2($r_ptr) - sbb 8*8($b_org), @acc[8] - mov 8*2($n_ptr), @acc[2] - mov @acc[3], 8*3($r_ptr) - sbb 8*9($b_org), @acc[9] - mov 8*3($n_ptr), @acc[3] - mov @acc[4], 8*4($r_ptr) - sbb 8*10($b_org), @acc[10] - mov 8*4($n_ptr), @acc[4] - mov @acc[5], 8*5($r_ptr) - sbb 8*11($b_org), @acc[11] - mov 8*5($n_ptr), @acc[5] - sbb $b_org, $b_org - - and $b_org, @acc[0] - and $b_org, @acc[1] - and $b_org, @acc[2] - and $b_org, @acc[3] - and $b_org, @acc[4] - and $b_org, @acc[5] - - add @acc[0], @acc[6] - adc @acc[1], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[2], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[3], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[4], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[5], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sub_mod_384x384,.-__sub_mod_384x384 - -.type __add_mod_384,\@abi-omnipotent -.align 32 -__add_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - add 8*0($b_org), @acc[0] - adc 8*1($b_org), @acc[1] - adc 8*2($b_org), @acc[2] - mov @acc[0], @acc[6] - adc 8*3($b_org), @acc[3] - mov @acc[1], @acc[7] - adc 8*4($b_org), @acc[4] - mov @acc[2], @acc[8] - adc 8*5($b_org), @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - 
sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc @acc[9], @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __add_mod_384,.-__add_mod_384 - -.type __sub_mod_384,\@abi-omnipotent -.align 32 -__sub_mod_384: - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - -__sub_mod_384_a_is_loaded: - sub 8*0($b_org), @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb 8*1($b_org), @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb 8*2($b_org), @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb 8*3($b_org), @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb 8*4($b_org), @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb 8*5($b_org), @acc[5] - mov 8*5($n_ptr), @acc[11] - sbb $b_org, $b_org - - and $b_org, @acc[6] - and $b_org, @acc[7] - and $b_org, @acc[8] - and $b_org, @acc[9] - and $b_org, @acc[10] - and $b_org, @acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[8], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[9], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[10], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[11], @acc[5] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __sub_mod_384,.-__sub_mod_384 -___ -} - -######################################################################## -# "Complex" multiplication and squaring. Use vanilla multiplication when -# possible to fold reductions. I.e. instead of mul_mont, mul_mont -# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod -# followed by *common* reduction... 
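The comment above is the key idea behind mul_mont_384x/sqr_mont_384x below: the real and imaginary halves are combined with plain 384x384->768-bit multiplications and double-width additions/subtractions, and only the two final values go through Montgomery reduction, saving a reduction compared with composing mul_mont with add/sub_mod. A C-level sketch of that flow, using helper names that mirror the routines in this module (the wrapper itself and the exact C signatures are illustrative, not part of blst's exported API):

    #include <stdint.h>

    typedef uint64_t vec384[6];   /* one 384-bit operand                */
    typedef uint64_t vec768[12];  /* one double-width (768-bit) product */

    /* Assumed helper declarations mirroring the assembly routines below. */
    void mul_384(vec768 ret, const vec384 a, const vec384 b);  /* plain multiply */
    void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
    void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, const vec384 p);
    void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, uint64_t n0);

    /* (a.re + i*a.im)*(b.re + i*b.im) with i^2 = -1, folding the reductions:
     * only two redc calls instead of three full Montgomery multiplications. */
    static void mul_mont_384x_sketch(vec384 ret_re, vec384 ret_im,
                                     const vec384 a_re, const vec384 a_im,
                                     const vec384 b_re, const vec384 b_im,
                                     const vec384 p, uint64_t n0)
    {
        vec768 t0, t1, t2;
        vec384 s0, s1;

        mul_384(t0, a_re, b_re);          /* t0 = a.re * b.re                   */
        mul_384(t1, a_im, b_im);          /* t1 = a.im * b.im                   */
        add_mod_384(s0, a_re, a_im, p);   /* s0 = a.re + a.im                   */
        add_mod_384(s1, b_re, b_im, p);   /* s1 = b.re + b.im                   */
        mul_384(t2, s0, s1);              /* t2 = s0 * s1                       */

        sub_mod_384x384(t2, t2, t0, p);   /* t2 -= t0                           */
        sub_mod_384x384(t2, t2, t1, p);   /* t2 -= t1: cross terms remain       */
        sub_mod_384x384(t0, t0, t1, p);   /* t0 = a.re*b.re - a.im*b.im         */

        redc_mont_384(ret_re, t0, p, n0); /* one shared reduction per component */
        redc_mont_384(ret_im, t2, p, n0);
    }

This is exactly the ordering the assembly below follows: three __mulq_384 calls, three double-width subtractions, then one __mulq_by_1_mont_384/__redc_tail_mont_384 pass per output half.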
-{ my $frame = 5*8 + # place for argument off-load + - 3*768/8; # place for 3 768-bit temporary vectors -$code.=<<___; -.globl mul_mont_384x -.hidden mul_mont_384x -.type mul_mont_384x,\@function,5,"unwind" -.align 32 -mul_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $b_org, $b_ptr - mov $r_ptr, 8*4(%rsp) # offload arguments - mov $a_ptr, 8*3(%rsp) - mov $b_org, 8*2(%rsp) - mov $n_ptr, 8*1(%rsp) - mov $n0, 8*0(%rsp) - - ################################# mul_384(t0, a->re, b->re); - #lea 0($b_btr), $b_ptr # b->re - #lea 0($a_ptr), $a_ptr # a->re - lea 40(%rsp), $r_ptr # t0 - call __mulq_384 - - ################################# mul_384(t1, a->im, b->im); - lea 48($b_ptr), $b_ptr # b->im - lea 48($a_ptr), $a_ptr # a->im - lea 40+96(%rsp), $r_ptr # t1 - call __mulq_384 - - ################################# mul_384(t2, a->re+a->im, b->re+b->im); - mov 8*1(%rsp), $n_ptr - lea -48($a_ptr), $b_org - lea 40+192+48(%rsp), $r_ptr - call __add_mod_384 - - mov 8*2(%rsp), $a_ptr - lea 48($a_ptr), $b_org - lea -48($r_ptr), $r_ptr - call __add_mod_384 - - lea ($r_ptr),$b_ptr - lea 48($r_ptr),$a_ptr - call __mulq_384 - - ################################# t2=t2-t0-t1 - lea ($r_ptr), $a_ptr # t2 - lea 40(%rsp), $b_org # t0 - mov 8*1(%rsp), $n_ptr - call __sub_mod_384x384 # t2=t2-t0 - - lea ($r_ptr), $a_ptr # t2 - lea -96($r_ptr), $b_org # t1 - call __sub_mod_384x384 # t2=t2-t1 - - ################################# t0=t0-t1 - lea 40(%rsp), $a_ptr - lea 40+96(%rsp), $b_org - lea 40(%rsp), $r_ptr - call __sub_mod_384x384 # t0-t1 - - mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 - - ################################# redc_mont_384(ret->re, t0, mod, n0); - lea 40(%rsp), $a_ptr # t0 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - mov 8*4(%rsp), $r_ptr # ret->re - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - ################################# redc_mont_384(ret->im, t2, mod, n0); - lea 40+192(%rsp), $a_ptr # t2 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - lea 48($r_ptr), $r_ptr # ret->im - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size mul_mont_384x,.-mul_mont_384x -___ -} -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # align -$code.=<<___; -.globl sqr_mont_384x -.hidden sqr_mont_384x -.type sqr_mont_384x,\@function,4,"unwind" -.align 32 -sqr_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - mov $a_ptr, 8*2(%rsp) - - ################################# add_mod_384(t0, a->re, a->im); - lea 48($a_ptr), $b_org # a->im - lea 32(%rsp), $r_ptr # t0 - call __add_mod_384 - - ################################# sub_mod_384(t1, a->re, 
a->im); - mov 8*2(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_org # a->im - lea 32+48(%rsp), $r_ptr # t1 - call __sub_mod_384 - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - mov 8*2(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rax # a->im - mov 8*0($a_ptr), @acc[6] # a->re - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[4] - mov 8*3($a_ptr), @acc[5] - - call __mulq_mont_384 -___ -{ -my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 - 12,13,"ax","bx","bp","si"); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $b_org, $b_org - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $b_org - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*6($r_ptr) # ret->im - cmovc @acc[9], @acc[3] - mov @acc[1], 8*7($r_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*8($r_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32(%rsp), $a_ptr # t0 - lea 32+48(%rsp), $b_ptr # t1 - - mov 32+48(%rsp), %rax # t1[0] - mov 32+8*0(%rsp), @acc[6] # t0[0..3] - mov 32+8*1(%rsp), @acc[7] - mov 32+8*2(%rsp), @acc[4] - mov 32+8*3(%rsp), @acc[5] - - call __mulq_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_384x,.-sqr_mont_384x - -.globl mul_382x -.hidden mul_382x -.type mul_382x,\@function,4,"unwind" -.align 32 -mul_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 96($r_ptr), $r_ptr # ret->im - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - mov $r_ptr, 8*2(%rsp) # offload ret->im - mov $n_ptr, 8*3(%rsp) - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - add 8*6($a_ptr), @acc[0] - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - - mov @acc[0], 32+8*0(%rsp) - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - ################################# t1 = b->re + b->im - mov 8*0($b_org), @acc[0] - mov 8*1($b_org), @acc[1] - mov 8*2($b_org), @acc[2] - mov 8*3($b_org), @acc[3] - mov 8*4($b_org), @acc[4] - mov 8*5($b_org), @acc[5] - - add 8*6($b_org), @acc[0] - adc 8*7($b_org), @acc[1] - adc 8*8($b_org), @acc[2] - adc 8*9($b_org), @acc[3] - adc 8*10($b_org), 
@acc[4] - adc 8*11($b_org), @acc[5] - - mov @acc[0], 32+8*6(%rsp) - mov @acc[1], 32+8*7(%rsp) - mov @acc[2], 32+8*8(%rsp) - mov @acc[3], 32+8*9(%rsp) - mov @acc[4], 32+8*10(%rsp) - mov @acc[5], 32+8*11(%rsp) - - ################################# mul_384(ret->im, t0, t1); - lea 32+8*0(%rsp), $a_ptr # t0 - lea 32+8*6(%rsp), $b_ptr # t1 - call __mulq_384 - - ################################# mul_384(ret->re, a->re, b->re); - mov 8*0(%rsp), $a_ptr - mov 8*1(%rsp), $b_ptr - lea -96($r_ptr), $r_ptr # ret->re - call __mulq_384 - - ################################# mul_384(tx, a->im, b->im); - lea 48($a_ptr), $a_ptr - lea 48($b_ptr), $b_ptr - lea 32(%rsp), $r_ptr - call __mulq_384 - - ################################# ret->im -= tx - mov 8*2(%rsp), $a_ptr # restore ret->im - lea 32(%rsp), $b_org - mov 8*3(%rsp), $n_ptr - mov $a_ptr, $r_ptr - call __sub_mod_384x384 - - ################################# ret->im -= ret->re - lea 0($r_ptr), $a_ptr - lea -96($r_ptr), $b_org - call __sub_mod_384x384 - - ################################# ret->re -= tx - lea -96($r_ptr), $a_ptr - lea 32(%rsp), $b_org - lea -96($r_ptr), $r_ptr - call __sub_mod_384x384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size mul_382x,.-mul_382x -___ -} -{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.globl sqr_382x -.hidden sqr_382x -.type sqr_382x,\@function,3,"unwind" -.align 32 -sqr_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 8*4($a_ptr), @acc[10] - mov 8*5($a_ptr), @acc[11] - - mov @acc[6], @acc[0] - add 8*6($a_ptr), @acc[6] - mov @acc[7], @acc[1] - adc 8*7($a_ptr), @acc[7] - mov @acc[8], @acc[2] - adc 8*8($a_ptr), @acc[8] - mov @acc[9], @acc[3] - adc 8*9($a_ptr), @acc[9] - mov @acc[10], @acc[4] - adc 8*10($a_ptr), @acc[10] - mov @acc[11], @acc[5] - adc 8*11($a_ptr), @acc[11] - - mov @acc[6], 8*0($r_ptr) - mov @acc[7], 8*1($r_ptr) - mov @acc[8], 8*2($r_ptr) - mov @acc[9], 8*3($r_ptr) - mov @acc[10], 8*4($r_ptr) - mov @acc[11], 8*5($r_ptr) - - ################################# t1 = a->re - a->im - lea 48($a_ptr), $b_org - lea 48($r_ptr), $r_ptr - call __sub_mod_384_a_is_loaded - - ################################# mul_384(ret->re, t0, t1); - lea ($r_ptr), $a_ptr - lea -48($r_ptr), $b_ptr - lea -48($r_ptr), $r_ptr - call __mulq_384 - - ################################# mul_384(ret->im, a->re, a->im); - mov (%rsp), $a_ptr - lea 48($a_ptr), $b_ptr - lea 96($r_ptr), $r_ptr - call __mulq_384 - - mov 8*0($r_ptr), @acc[0] # double ret->im - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - mov 8*4($r_ptr), @acc[4] - mov 8*5($r_ptr), @acc[5] - mov 8*6($r_ptr), @acc[6] - mov 8*7($r_ptr), @acc[7] - mov 8*8($r_ptr), @acc[8] - mov 8*9($r_ptr), @acc[9] - mov 8*10($r_ptr), @acc[10] - add @acc[0], @acc[0] - mov 
8*11($r_ptr), @acc[11] - adc @acc[1], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[2], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[3], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[4], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[5], @acc[5] - mov @acc[4], 8*4($r_ptr) - adc @acc[6], @acc[6] - mov @acc[5], 8*5($r_ptr) - adc @acc[7], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[8], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[9], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[10], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[11], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - mov 8*1(%rsp),%r15 -.cfi_restore %r15 - mov 8*2(%rsp),%r14 -.cfi_restore %r14 - mov 8*3(%rsp),%r13 -.cfi_restore %r13 - mov 8*4(%rsp),%r12 -.cfi_restore %r12 - mov 8*5(%rsp),%rbx -.cfi_restore %rbx - mov 8*6(%rsp),%rbp -.cfi_restore %rbp - lea 8*7(%rsp),%rsp -.cfi_adjust_cfa_offset -8*7 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_382x,.-sqr_382x -___ -} -{ ########################################################## 384-bit mul -my @acc=map("%r$_",("cx",8..12)); -my $bi = "%rbp"; - -$code.=<<___; -.globl mul_384 -.hidden mul_384 -.type mul_384,\@function,3,"unwind" -.align 32 -mul_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 -.cfi_end_prologue - - mov $b_org, $b_ptr - call __mulq_384 - - mov 0(%rsp),%r12 -.cfi_restore %r12 - mov 8(%rsp),%rbx -.cfi_restore %rbx - mov 16(%rsp),%rbp -.cfi_restore %rbp - lea 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 -.cfi_epilogue - ret -.cfi_endproc -.size mul_384,.-mul_384 - -.type __mulq_384,\@abi-omnipotent -.align 32 -__mulq_384: - mov 8*0($b_ptr), %rax - - mov %rax, $bi - mulq 8*0($a_ptr) - mov %rax, 8*0($r_ptr) - mov $bi, %rax - mov %rdx, @acc[0] - - mulq 8*1($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[1] - - mulq 8*2($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq 8*3($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mulq 8*4($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*5($a_ptr) - add %rax, @acc[4] - mov 8*1($b_ptr), %rax - adc \$0, %rdx - mov %rdx, @acc[5] -___ -for(my $i=1; $i<6; $i++) { -my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; -$code.=<<___; - mov %rax, $bi - mulq 8*0($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov @acc[0], 8*$i($r_ptr) - mov %rdx, @acc[0] - - mulq 8*1($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - add @acc[1], @acc[0] - adc \$0, %rdx - mov %rdx, @acc[1] - - mulq 8*2($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - add @acc[2], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq 8*3($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - add @acc[3], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[3] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - add @acc[4], @acc[3] - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*5($a_ptr) - add %rax, @acc[5] - mov $b_next, %rax - adc \$0, %rdx - add @acc[5], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[5] -___ -} -$code.=<<___; - mov @acc[0], 8*6($r_ptr) - mov @acc[1], 8*7($r_ptr) - mov @acc[2], 8*8($r_ptr) - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) - - ret -.size __mulq_384,.-__mulq_384 -___ -} -if (0) { ############################################################## -my @b=map("%r$_",(10..15)); -my @a=reverse(@b); - @b[5]=$b_ptr; -my $bi = "%rbp"; -my @comba=map("%r$_",("cx",8,9)); -# a[0]*b[0] -# a[1]*b[0] -# a[0]*b[1] -# a[2]*b[0] -# a[1]*b[1] -# a[0]*b[2] -# a[3]*b[0] -# a[2]*b[1] -# a[1]*b[2] -# a[0]*b[3] -# a[4]*b[0] -# a[3]*b[1] -# a[2]*b[2] -# a[1]*b[3] -# a[0]*b[4] -# a[5]*b[0] -# a[4]*b[1] -# a[3]*b[2] -# a[2]*b[3] -# a[1]*b[4] -# a[0]*b[5] -# a[5]*b[1] -# a[4]*b[2] -# a[3]*b[3] -# a[2]*b[4] -# a[1]*b[5] -# a[5]*b[2] -# a[4]*b[3] -# a[3]*b[4] -# a[2]*b[5] -# a[5]*b[3] -# a[4]*b[4] -# a[3]*b[5] -# a[5]*b[4] -# a[4]*b[5] -# a[5]*b[5] -# -# 13% less instructions give +15% on Core2, +10% on Goldmont, -# -0% on Sandy Bridge, but -16% on Haswell:-( -# [for reference +5% on Skylake, +11% on Ryzen] - -$code.=<<___; -.type __mulq_comba_384,\@abi-omnipotent -.align 32 -__mulq_comba_384: - mov 8*0($b_ptr), %rax - mov 8*0($a_ptr), @a[0] - mov 8*1($a_ptr), @a[1] - mov 8*1($b_ptr), @b[1] - - mov %rax, @b[0] - mulq @a[0] # a[0]*b[0] - mov %rax, 8*0($r_ptr) - mov @b[0], %rax - mov %rdx, @comba[0] - - ################################# - mov 8*2($a_ptr), @a[2] - xor @comba[2], @comba[2] - mulq @a[1] # a[1]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc \$0, %rdx - mov 8*2($b_ptr), @b[2] - mov %rdx, @comba[1] - - mulq @a[0] # a[0]*b[1] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*1($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[2] # a[2]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[1] # a[1]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[0] # a[0]*b[2] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*2($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq 8*3($a_ptr) # a[3]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[2] # a[2]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[1] # a[1]*b[2] - add %rax, @comba[0] - mov 8*3($b_ptr), %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mov %rax, @b[3] - mulq @a[0] # a[0]*b[3] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 
8*3($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq 8*4($a_ptr) # a[4]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*3($a_ptr) # a[3]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[1] # a[1]*b[3] - add %rax, @comba[0] - mov 8*4($b_ptr), %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mov %rax, @b[4] - mulq @a[0] # a[0]*b[4] - add %rax, @comba[0] - mov @b[0], %rax - adc %rdx, @comba[1] - mov 8*5($a_ptr), @a[5] - adc \$0, @comba[2] - mov @comba[0], 8*4($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[0] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*4($a_ptr) # a[4]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*3($a_ptr) # a[3]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*1($a_ptr) # a[1]*b[4] - add %rax, @comba[0] - mov 8*5($b_ptr), %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mov %rax, @b[5] - mulq @a[0] # a[0]*b[5] - add %rax, @comba[0] - mov @b[1], %rax - adc %rdx, @comba[1] - mov 8*4($a_ptr), @a[4] - adc \$0, @comba[2] - mov @comba[0], 8*5($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[1] - add %rax, @comba[0] - mov @b[2], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*3($a_ptr) # a[3]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*1($a_ptr) # a[1]*b[5] - add %rax, @comba[0] - mov $b[2], %rax - adc %rdx, @comba[1] - mov 8*3($a_ptr), @a[3] - adc \$0, @comba[2] - mov @comba[0], 8*6($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[2] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[3] # a[3]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq 8*2($a_ptr) # a[2]*b[5] - add %rax, @comba[0] - mov @b[3], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*7($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[3] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[3] # a[3]*b[5] - add %rax, @comba[0] - mov @b[4], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov @comba[0], 8*8($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - xor @comba[2], @comba[2] - mulq @a[5] # a[5]*b[4] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - - mulq @a[4] # a[4]*b[5] - add %rax, @comba[0] - mov @b[5], %rax - adc %rdx, @comba[1] - adc \$0, @comba[2] - mov 
@comba[0], 8*9($r_ptr) -___ - push(@comba,shift(@comba)); -$code.=<<___; - mulq @a[5] # a[5]*b[4] - add %rax, @comba[0] - adc %rdx, @comba[1] - - mov @comba[0], 8*10($r_ptr) - mov @comba[1], 8*11($r_ptr) - - ret -.size __mulq_comba_384,.-__mulq_comba_384 -___ -} -{ ########################################################## 384-bit sqr -my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); -my $hi; - -$code.=<<___; -.globl sqr_384 -.hidden sqr_384 -.type sqr_384,\@function,2,"unwind" -.align 32 -sqr_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sqrq_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_384,.-sqr_384 - -.type __sqrq_384,\@abi-omnipotent -.align 32 -__sqrq_384: - mov 8*0($a_ptr), %rax - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - - ######################################### - mov %rax, @acc[6] - mulq @acc[7] # a[1]*a[0] - mov %rax, @acc[1] - mov @acc[6], %rax - mov 8*4($a_ptr), @acc[10] - mov %rdx, @acc[2] - - mulq @acc[8] # a[2]*a[0] - add %rax, @acc[2] - mov @acc[6], %rax - adc \$0, %rdx - mov 8*5($a_ptr), @acc[11] - mov %rdx, @acc[3] - - mulq @acc[9] # a[3]*a[0] - add %rax, @acc[3] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq @acc[10] # a[4]*a[0] - add %rax, @acc[4] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[5] - - mulq @acc[11] # a[5]*a[0] - add %rax, @acc[5] - mov @acc[6], %rax - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq %rax # a[0]*a[0] - xor @acc[0], @acc[0] - mov %rax, 8*0($r_ptr) - mov @acc[7], %rax - add @acc[1], @acc[1] # double acc[1] - adc \$0, @acc[0] - add %rdx, @acc[1] # accumulate a[0]*a[0] - adc \$0, @acc[0] # carries to a[1]*a[1] - mov @acc[1], 8*1($r_ptr) -___ -$hi=@acc[1]; -$code.=<<___; - ######################################### - mulq @acc[8] # a[2]*a[1] - add %rax, @acc[3] - mov @acc[7], %rax - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[9] # a[3]*a[1] - add %rax, @acc[4] - mov @acc[7], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[10] # a[4]*a[1] - add %rax, @acc[5] - mov @acc[7], %rax - adc \$0, %rdx - add $hi, @acc[5] - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[11] # a[5]*a[1] - add %rax, @acc[6] - mov @acc[7], %rax - adc \$0, %rdx - add $hi, @acc[6] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq %rax # a[1]*a[1] - xor @acc[1], @acc[1] - add %rax, @acc[0] # can't carry - mov @acc[8], %rax - add @acc[2], @acc[2] # double acc[2:3] - adc @acc[3], @acc[3] - adc \$0, @acc[1] - add @acc[0], @acc[2] # accumulate a[1]*a[1] - adc %rdx, @acc[3] - adc \$0, @acc[1] # carries to a[2]*a[2] - mov @acc[2], 8*2($r_ptr) -___ -$hi=@acc[0]; -$code.=<<___; - ######################################### - mulq @acc[9] # a[3]*a[2] - add %rax, @acc[5] - mov @acc[8], %rax - adc \$0, %rdx - mov @acc[3], 8*3($r_ptr) - mov %rdx, $hi - - mulq @acc[10] # a[4]*a[2] - add %rax, @acc[6] - mov @acc[8], %rax - adc \$0, %rdx - add $hi, @acc[6] - adc \$0, %rdx - mov %rdx, $hi - - mulq @acc[11] # a[5]*a[2] - add %rax, @acc[7] - mov @acc[8], %rax - adc \$0, %rdx - add $hi, @acc[7] - adc \$0, %rdx - mov %rdx, 
@acc[8] - - mulq %rax # a[2]*a[2] - xor @acc[3], @acc[3] - add %rax, @acc[1] # can't carry - mov @acc[9], %rax - add @acc[4], @acc[4] # double acc[4:5] - adc @acc[5], @acc[5] - adc \$0, @acc[3] - add @acc[1], @acc[4] # accumulate a[2]*a[2] - adc %rdx, @acc[5] - adc \$0, @acc[3] # carries to a[3]*a[3] - mov @acc[4], 8*4($r_ptr) - - ######################################### - mulq @acc[10] # a[4]*a[3] - add %rax, @acc[7] - mov @acc[9], %rax - adc \$0, %rdx - mov @acc[5], 8*5($r_ptr) - mov %rdx, $hi - - mulq @acc[11] # a[5]*a[3] - add %rax, @acc[8] - mov @acc[9], %rax - adc \$0, %rdx - add $hi, @acc[8] - adc \$0, %rdx - mov %rdx, @acc[9] - - mulq %rax # a[3]*a[3] - xor @acc[4], @acc[4] - add %rax, @acc[3] # can't carry - mov @acc[10], %rax - add @acc[6], @acc[6] # double acc[6:7] - adc @acc[7], @acc[7] - adc \$0, @acc[4] - add @acc[3], @acc[6] # accumulate a[3]*a[3] - adc %rdx, @acc[7] - mov @acc[6], 8*6($r_ptr) - adc \$0, @acc[4] # carries to a[4]*a[4] - mov @acc[7], 8*7($r_ptr) - - ######################################### - mulq @acc[11] # a[5]*a[4] - add %rax, @acc[9] - mov @acc[10], %rax - adc \$0, %rdx - mov %rdx, @acc[10] - - mulq %rax # a[4]*a[4] - xor @acc[5], @acc[5] - add %rax, @acc[4] # can't carry - mov @acc[11], %rax - add @acc[8], @acc[8] # double acc[8:9] - adc @acc[9], @acc[9] - adc \$0, @acc[5] - add @acc[4], @acc[8] # accumulate a[4]*a[4] - adc %rdx, @acc[9] - mov @acc[8], 8*8($r_ptr) - adc \$0, @acc[5] # carries to a[5]*a[5] - mov @acc[9], 8*9($r_ptr) - - ######################################### - mulq %rax # a[5]*a[5] - add @acc[5], %rax # can't carry - add @acc[10], @acc[10] # double acc[10] - adc \$0, %rdx - add @acc[10], %rax # accumulate a[5]*a[5] - adc \$0, %rdx - mov %rax, 8*10($r_ptr) - mov %rdx, 8*11($r_ptr) - - ret -.size __sqrq_384,.-__sqrq_384 - -.globl sqr_mont_384 -.hidden sqr_mont_384 -.type sqr_mont_384,\@function,4,"unwind" -.align 32 -sqr_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*15, %rsp -.cfi_adjust_cfa_offset 8*15 -.cfi_end_prologue - - mov $n_ptr, 8*12(%rsp) # n0 - mov $b_org, 8*13(%rsp) # n_ptr - mov $r_ptr, 8*14(%rsp) - - mov %rsp, $r_ptr - call __sqrq_384 - - lea 0(%rsp), $a_ptr - mov 8*12(%rsp), %rcx # n0 for mul_by_1 - mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 - mov 8*14(%rsp), $r_ptr - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - lea 8*15(%rsp), %r8 # size optimization - mov 8*15(%rsp), %r15 -.cfi_restore %r15 - mov 8*1(%r8), %r14 -.cfi_restore %r14 - mov 8*2(%r8), %r13 -.cfi_restore %r13 - mov 8*3(%r8), %r12 -.cfi_restore %r12 - mov 8*4(%r8), %rbx -.cfi_restore %rbx - mov 8*5(%r8), %rbp -.cfi_restore %rbp - lea 8*6(%r8), %rsp -.cfi_adjust_cfa_offset -8*21 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_384,.-sqr_mont_384 -___ -} -{ ########################################################## 384-bit redc_mont -my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" - -$code.=<<___; -######################################################################## -# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], -# uint64_t m[6], uint64_t n0); -.globl redc_mont_384 -.hidden redc_mont_384 -.type redc_mont_384,\@function,4,"unwind" -.align 32 -redc_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp 
-.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size redc_mont_384,.-redc_mont_384 - -######################################################################## -# void from_mont_384(uint64_t ret[6], const uint64_t a[6], -# uint64_t m[6], uint64_t n0); -.globl from_mont_384 -.hidden from_mont_384 -.type from_mont_384,\@function,4,"unwind" -.align 32 -from_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulq_by_1_mont_384 - - ################################# - # Branch-less conditional acc[0:6] - modulus - - #mov @acc[6], %rax # __mulq_by_1_mont_384 does it - mov @acc[7], %rcx - mov @acc[0], %rdx - mov @acc[1], %rbp - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[7] - mov @acc[2], @acc[5] - sbb 8*2($n_ptr), @acc[0] - sbb 8*3($n_ptr), @acc[1] - sbb 8*4($n_ptr), @acc[2] - mov @acc[3], $a_ptr - sbb 8*5($n_ptr), @acc[3] - - cmovc %rax, @acc[6] - cmovc %rcx, @acc[7] - cmovc %rdx, @acc[0] - mov @acc[6], 8*0($r_ptr) - cmovc %rbp, @acc[1] - mov @acc[7], 8*1($r_ptr) - cmovc @acc[5], @acc[2] - mov @acc[0], 8*2($r_ptr) - cmovc $a_ptr, @acc[3] - mov @acc[1], 8*3($r_ptr) - mov @acc[2], 8*4($r_ptr) - mov @acc[3], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size from_mont_384,.-from_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulq_by_1_mont_384,\@abi-omnipotent -.align 32 -__mulq_by_1_mont_384: - mov 8*0($a_ptr), %rax - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov %rax, @acc[6] - imulq $n0, %rax - mov %rax, @acc[0] -___ -for (my $i=0; $i<6; $i++) { -my $hi = @acc[6]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, @acc[6] # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, @acc[6] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[6], @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add %rax, @acc[3] - mov @acc[0], %rax - adc \$0, %rdx -___ -$code.=<<___ if ($i<5); - mov @acc[1], @acc[7] - imulq $n0, @acc[1] -___ -$code.=<<___; - add $hi, @acc[3] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*4($n_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*5($n_ptr) - add %rax, @acc[5] - mov @acc[1], %rax - adc \$0, %rdx - add $hi, @acc[5] - adc \$0, %rdx - mov %rdx, @acc[6] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - ret -.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 - 
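The pair of helpers above is the reference shape of the Montgomery reduction exported as redc_mont_384 (prototype quoted in the hunk): __mulq_by_1_mont_384 runs six word-level reduction steps over the low half of the input, and __redc_tail_mont_384 folds in the upper half and finishes with one branch-less conditional subtraction of the modulus. The same computation, restated as a plain-C sketch; the helper name, the use of unsigned __int128, and the precondition a < m * 2^384 are assumptions for illustration only, and the sketch is not constant-time.

    #include <stdint.h>

    typedef unsigned __int128 u128;   /* assumption: GCC/Clang 128-bit type */

    /* Sketch of redc_mont_384: ret = a * 2^-384 mod m, n0 = -m[0]^-1 mod 2^64.
     * Limbs are little-endian 64-bit words.  Assumes a < m * 2^384 so a single
     * conditional subtraction suffices. */
    static void redc_mont_384_sketch(uint64_t ret[6], const uint64_t a[12],
                                     const uint64_t m[6], uint64_t n0)
    {
        uint64_t acc[13] = {0};
        for (int i = 0; i < 12; i++) acc[i] = a[i];

        for (int i = 0; i < 6; i++) {           /* "reduction $i" above */
            uint64_t q = acc[i] * n0;           /* chosen so limb i becomes zero */
            uint64_t carry = 0;
            for (int j = 0; j < 6; j++) {       /* acc += q * m * 2^(64*i) */
                u128 t = (u128)q * m[j] + acc[i + j] + carry;
                acc[i + j] = (uint64_t)t;
                carry      = (uint64_t)(t >> 64);
            }
            for (int j = i + 6; carry && j < 13; j++) {  /* propagate carry */
                u128 t = (u128)acc[j] + carry;
                acc[j] = (uint64_t)t;
                carry  = (uint64_t)(t >> 64);
            }
        }

        /* acc[6..12] now holds a * 2^-384; conditional subtraction as in
         * __redc_tail_mont_384 */
        uint64_t tmp[6], borrow = 0;
        for (int j = 0; j < 6; j++) {
            u128 t = (u128)acc[6 + j] - m[j] - borrow;
            tmp[j] = (uint64_t)t;
            borrow = (uint64_t)(t >> 64) & 1;
        }
        int keep = borrow > acc[12];            /* underflow and no spare top bit */
        for (int j = 0; j < 6; j++)
            ret[j] = keep ? acc[6 + j] : tmp[j];
    }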
-.type __redc_tail_mont_384,\@abi-omnipotent -.align 32 -__redc_tail_mont_384: - add 8*6($a_ptr), @acc[0] # accumulate upper half - mov @acc[0], %rax - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - mov @acc[1], %rcx - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - sbb @acc[6], @acc[6] - - ################################# - # Branch-less conditional acc[0:6] - modulus - - mov @acc[2], %rdx - mov @acc[3], %rbp - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[7] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], $a_ptr - sbb 8*5($n_ptr), @acc[5] - sbb \$0, @acc[6] - - cmovc %rax, @acc[0] - cmovc %rcx, @acc[1] - cmovc %rdx, @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc %rbp, @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc $a_ptr, @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __redc_tail_mont_384,.-__redc_tail_mont_384 - -.globl sgn0_pty_mont_384 -.hidden sgn0_pty_mont_384 -.type sgn0_pty_mont_384,\@function,3,"unwind" -.align 32 -sgn0_pty_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 0($r_ptr), $a_ptr - mov $b_org, $n0 - call __mulq_by_1_mont_384 - - xor %rax, %rax - mov @acc[0], @acc[7] - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - not %rax # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 - -.globl sgn0_pty_mont_384x -.hidden sgn0_pty_mont_384x -.type sgn0_pty_mont_384x,\@function,3,"unwind" -.align 32 -sgn0_pty_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 48($r_ptr), $a_ptr # sgn0(a->im) - mov $b_org, $n0 - call __mulq_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - lea 0($r_ptr), $a_ptr # sgn0(a->re) - xor $r_ptr, $r_ptr - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, $r_ptr - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov @acc[0], 0(%rsp) # a->im is zero or not - not $r_ptr # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, 
$r_ptr - or @acc[7], $r_ptr # pack sign and parity - - call __mulq_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - xor %rax, %rax - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - mov 0(%rsp), @acc[6] - - not %rax # 2*x > p, which means "negative" - - test @acc[0], @acc[0] - cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) - - test @acc[6], @acc[6] - cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) - - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x -___ -} } - -{ ########################################################## mulq_mont -my ($bi, $hi) = ("%rdi", "%rbp"); - -$code.=<<___; -.globl mul_mont_384 -.hidden mul_mont_384 -.type mul_mont_384,\@function,5,"unwind" -.align 32 -mul_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*3, %rsp -.cfi_adjust_cfa_offset 8*3 -.cfi_end_prologue - - mov 8*0($b_org), %rax - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[4] - mov 8*3($a_ptr), @acc[5] - mov $b_org, $b_ptr # evacuate from %rdx - mov $n0, 8*0(%rsp) - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - - call __mulq_mont_384 - - mov 24(%rsp),%r15 -.cfi_restore %r15 - mov 32(%rsp),%r14 -.cfi_restore %r14 - mov 40(%rsp),%r13 -.cfi_restore %r13 - mov 48(%rsp),%r12 -.cfi_restore %r12 - mov 56(%rsp),%rbx -.cfi_restore %rbx - mov 64(%rsp),%rbp -.cfi_restore %rbp - lea 72(%rsp),%rsp -.cfi_adjust_cfa_offset -72 -.cfi_epilogue - ret -.cfi_endproc -.size mul_mont_384,.-mul_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulq_mont_384,\@abi-omnipotent -.align 32 -__mulq_mont_384: - mov %rax, $bi - mulq @acc[6] # a[0]*b[0] - mov %rax, @acc[0] - mov $bi, %rax - mov %rdx, @acc[1] - - mulq @acc[7] # a[1]*b[0] - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq @acc[4] # a[2]*b[0] - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mov @acc[0], $hi - imulq 8(%rsp), @acc[0] - - mulq @acc[5] # a[3]*b[0] - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[5] - - mulq 8*5($a_ptr) - add %rax, @acc[5] - mov @acc[0], %rax - adc \$0, %rdx - xor @acc[7], @acc[7] - mov %rdx, @acc[6] -___ -for (my $i=0; $i<6;) { -my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, $hi # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, $hi - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[1] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[2] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*3($n_ptr) - add $hi, @acc[3] - adc \$0, %rdx - add %rax, @acc[3] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*4($n_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add $hi, @acc[4] - adc \$0, %rdx - mov %rdx, $hi - - mulq 8*5($n_ptr) - add %rax, @acc[5] - mov $b_next, %rax - adc \$0, %rdx - add $hi, @acc[5] - adc %rdx, @acc[6] - adc \$0, @acc[7] -___ - push(@acc,shift(@acc)); -$code.=<<___ if ($i++<5); - ################################# Multiply by b[$i] - mov %rax, $bi - mulq 8*0($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*1($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*2($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[7] - - mov @acc[0], $hi - imulq 8(%rsp), @acc[0] - - mulq 8*3($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[3] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - add @acc[7], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*5($a_ptr) - add @acc[7], @acc[5] - adc \$0, %rdx - xor @acc[7], @acc[7] - add %rax, @acc[5] - mov @acc[0], %rax - adc %rdx, @acc[6] - adc \$0, @acc[7] -___ -} -$code.=<<___; - ################################# - # Branch-less conditional acc[0:6] - modulus - - #mov @acc[0], %rax - mov 8*2(%rsp), $r_ptr # restore $r_ptr - sub 8*0($n_ptr), @acc[0] - mov @acc[1], %rdx - sbb 8*1($n_ptr), @acc[1] - mov @acc[2], $b_ptr - sbb 8*2($n_ptr), @acc[2] - mov @acc[3], $a_ptr - sbb 8*3($n_ptr), @acc[3] - mov @acc[4], $hi - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[7] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, @acc[6] - - cmovc %rax, @acc[0] - cmovc %rdx, @acc[1] - cmovc $b_ptr, @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc $a_ptr, @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc $hi, @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc @acc[7], @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __mulq_mont_384,.-__mulq_mont_384 -___ -} } -$code.=<<___; -.globl sqr_n_mul_mont_384 -.hidden sqr_n_mul_mont_384 -.type sqr_n_mul_mont_384,\@function,6,"unwind" -.align 32 -sqr_n_mul_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*17, %rsp -.cfi_adjust_cfa_offset 8*17 -.cfi_end_prologue - - mov $n0, 8*0(%rsp) - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - mov $n_ptr, 8*2(%rsp) - lea 8*4(%rsp), $r_ptr - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq (%r9), %xmm2 # prefetch b[0] - -.Loop_sqr_384: - movd %edx, %xmm1 # loop counter - - call __sqrq_384 - - lea 0($r_ptr), $a_ptr - mov 8*0(%rsp), %rcx # n0 for mul_by_1 - mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 - call __mulq_by_1_mont_384 - call __redc_tail_mont_384 - - movd %xmm1, %edx - lea 0($r_ptr), $a_ptr - dec %edx - jnz .Loop_sqr_384 - - movq %xmm2, %rax # 
b[0] - mov $b_ptr, $n_ptr - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - - #mov 8*0($b_ptr), %rax - #mov 8*0($a_ptr), @acc[6] - #mov 8*1($a_ptr), @acc[7] - #mov 8*2($a_ptr), @acc[4] - #mov 8*3($a_ptr), @acc[5] - mov @acc[0], @acc[4] - mov @acc[1], @acc[5] - - call __mulq_mont_384 - - lea 8*17(%rsp), %r8 # size optimization - mov 8*17(%rsp), %r15 -.cfi_restore %r15 - mov 8*1(%r8), %r14 -.cfi_restore %r14 - mov 8*2(%r8), %r13 -.cfi_restore %r13 - mov 8*3(%r8), %r12 -.cfi_restore %r12 - mov 8*4(%r8), %rbx -.cfi_restore %rbx - mov 8*5(%r8), %rbp -.cfi_restore %rbp - lea 8*6(%r8), %rsp -.cfi_adjust_cfa_offset -8*23 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 - -.globl sqr_n_mul_mont_383 -.hidden sqr_n_mul_mont_383 -.type sqr_n_mul_mont_383,\@function,6,"unwind" -.align 32 -sqr_n_mul_mont_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8*17, %rsp -.cfi_adjust_cfa_offset 8*17 -.cfi_end_prologue - - mov $n0, 8*0(%rsp) - mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 - mov $n_ptr, 8*2(%rsp) - lea 8*4(%rsp), $r_ptr - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq (%r9), %xmm2 # prefetch b[0] - -.Loop_sqr_383: - movd %edx, %xmm1 # loop counter - - call __sqrq_384 - - lea 0($r_ptr), $a_ptr - mov 8*0(%rsp), %rcx # n0 for mul_by_1 - mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 - call __mulq_by_1_mont_384 - - movd %xmm1, %edx # loop counter - add 8*6($a_ptr), @acc[6] # just accumulate upper half - adc 8*7($a_ptr), @acc[7] - adc 8*8($a_ptr), @acc[0] - adc 8*9($a_ptr), @acc[1] - adc 8*10($a_ptr), @acc[2] - adc 8*11($a_ptr), @acc[3] - lea 0($r_ptr), $a_ptr - - mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% - mov @acc[7], 8*1($r_ptr) # in addition-chains - mov @acc[0], 8*2($r_ptr) - mov @acc[1], 8*3($r_ptr) - mov @acc[2], 8*4($r_ptr) - mov @acc[3], 8*5($r_ptr) - - dec %edx - jnz .Loop_sqr_383 - - movq %xmm2, %rax # b[0] - mov $b_ptr, $n_ptr - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - - #movq 8*0($b_ptr), %rax - #mov 8*0($a_ptr), @acc[6] - #mov 8*1($a_ptr), @acc[7] - #mov 8*2($a_ptr), @acc[4] - #mov 8*3($a_ptr), @acc[5] - mov @acc[0], @acc[4] - mov @acc[1], @acc[5] - - call __mulq_mont_384 # formally one can omit full reduction - # even after multiplication... 
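sqr_n_mul_mont_384 and sqr_n_mul_mont_383 both compute ret = a^(2^count) * b in Montgomery form, which is exactly the step an exponentiation addition chain needs. Per the comments above, the _383 variant omits the final conditional subtraction after each squaring, relying on the modulus being short enough that a not-fully-reduced value still fits in six limbs, which is quoted as worth roughly 5% in addition chains. A hedged usage sketch follows: the C prototype is inferred from the register handling in the hunk, and pow_chain_demo is a made-up caller, not blst code.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t vec384[6];

    /* Assumed prototype, inferred from the assembly's argument handling:
     * ret = a^(2^count) * b, all operands in Montgomery form mod p. */
    void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count,
                            const vec384 p, uint64_t n0, const vec384 b);

    /* Hypothetical fragment of an addition chain: each call squares `count`
     * times and multiplies once, so a chain is just a list of
     * (count, multiplier) pairs. */
    static void pow_chain_demo(vec384 ret, const vec384 x,
                               const vec384 p, uint64_t n0)
    {
        vec384 x3, x13, t;

        sqr_n_mul_mont_383(x3,  x,   1, p, n0, x);    /* x^2   * x   = x^3   */
        sqr_n_mul_mont_383(x13, x3,  2, p, n0, x);    /* x^12  * x   = x^13  */
        sqr_n_mul_mont_383(t,   x13, 4, p, n0, x3);   /* x^208 * x^3 = x^211 */
        /* real chains (e.g. inversion by Fermat) continue in this pattern */
        for (int i = 0; i < 6; i++) ret[i] = t[i];
    }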
- lea 8*17(%rsp), %r8 # size optimization - mov 8*17(%rsp), %r15 -.cfi_restore %r15 - mov 8*1(%r8), %r14 -.cfi_restore %r14 - mov 8*2(%r8), %r13 -.cfi_restore %r13 - mov 8*3(%r8), %r12 -.cfi_restore %r12 - mov 8*4(%r8), %rbx -.cfi_restore %rbx - mov 8*5(%r8), %rbp -.cfi_restore %rbp - lea 8*6(%r8), %rsp -.cfi_adjust_cfa_offset -8*23 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 -___ -{ my @acc=@acc; # will be rotated locally - my $bi = "%rbp"; - -$code.=<<___; -.type __mulq_mont_383_nonred,\@abi-omnipotent -.align 32 -__mulq_mont_383_nonred: - mov %rax, $bi - mulq @acc[6] # a[0]*b[0] - mov %rax, @acc[0] - mov $bi, %rax - mov %rdx, @acc[1] - - mulq @acc[7] # a[1]*b[0] - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[2] - - mulq @acc[4] # a[2]*b[0] - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[3] - - mov @acc[0], @acc[7] - imulq 8(%rsp), @acc[0] - - mulq @acc[5] # a[3]*b[0] - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[4] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[5] - - mulq 8*5($a_ptr) - add %rax, @acc[5] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, @acc[6] -___ -for (my $i=0; $i<6;) { -my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - ################################# reduction $i - mulq 8*0($n_ptr) - add %rax, @acc[7] # guaranteed to be zero - mov @acc[0], %rax - adc %rdx, @acc[7] - - mulq 8*1($n_ptr) - add %rax, @acc[1] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[7], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*2($n_ptr) - add %rax, @acc[2] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[7], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*3($n_ptr) - add @acc[7], @acc[3] - adc \$0, %rdx - add %rax, @acc[3] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*4($n_ptr) - add %rax, @acc[4] - mov @acc[0], %rax - adc \$0, %rdx - add @acc[7], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[7] - - mulq 8*5($n_ptr) - add %rax, @acc[5] - mov $b_next, %rax - adc \$0, %rdx - add @acc[7], @acc[5] - adc %rdx, @acc[6] -___ - push(@acc,shift(@acc)); -$code.=<<___ if ($i++<5); - ################################# Multiply by b[$i] - mov %rax, $bi - mulq 8*0($a_ptr) - add %rax, @acc[0] - mov $bi, %rax - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*1($a_ptr) - add %rax, @acc[1] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[1] - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*2($a_ptr) - add %rax, @acc[2] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[2] - adc \$0, %rdx - mov %rdx, @acc[6] - - mov @acc[0], @acc[7] - imulq 8(%rsp), @acc[0] - - mulq 8*3($a_ptr) - add %rax, @acc[3] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[3] - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*4($a_ptr) - add %rax, @acc[4] - mov $bi, %rax - adc \$0, %rdx - add @acc[6], @acc[4] - adc \$0, %rdx - mov %rdx, @acc[6] - - mulq 8*5($a_ptr) - add @acc[6], @acc[5] - adc \$0, %rdx - add %rax, @acc[5] - mov @acc[0], %rax - adc \$0, %rdx - mov %rdx, @acc[6] -___ -} -$code.=<<___; - ret -.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred -___ -} -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # align -my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); - -# omitting 3 reductions gives 8-11% better performance in add-chains -$code.=<<___; -.globl sqr_mont_382x -.hidden sqr_mont_382x -.type sqr_mont_382x,\@function,4,"unwind" -.align 32 -sqr_mont_382x: -.cfi_startproc - push 
%rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - mov $a_ptr, 8*2(%rsp) - mov $r_ptr, 8*3(%rsp) - - ################################# - mov 8*0($a_ptr), @acc[0] # a->re - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov @acc[0], @acc[6] - add 8*6($a_ptr), @acc[0] # a->re + a->im - mov @acc[1], @acc[7] - adc 8*7($a_ptr), @acc[1] - mov @acc[2], @acc[8] - adc 8*8($a_ptr), @acc[2] - mov @acc[3], @acc[9] - adc 8*9($a_ptr), @acc[3] - mov @acc[4], @acc[10] - adc 8*10($a_ptr), @acc[4] - mov @acc[5], @acc[11] - adc 8*11($a_ptr), @acc[5] - - sub 8*6($a_ptr), @acc[6] # a->re - a->im - sbb 8*7($a_ptr), @acc[7] - sbb 8*8($a_ptr), @acc[8] - sbb 8*9($a_ptr), @acc[9] - sbb 8*10($a_ptr), @acc[10] - sbb 8*11($a_ptr), @acc[11] - sbb $r_ptr, $r_ptr # borrow flag as mask - - mov @acc[0], 32+8*0(%rsp) # t0 - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - mov @acc[6], 32+8*6(%rsp) # t1 - mov @acc[7], 32+8*7(%rsp) - mov @acc[8], 32+8*8(%rsp) - mov @acc[9], 32+8*9(%rsp) - mov @acc[10], 32+8*10(%rsp) - mov @acc[11], 32+8*11(%rsp) - mov $r_ptr, 32+8*12(%rsp) - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - #mov 8*2(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rax # a->im - mov 8*0($a_ptr), @acc[6] # a->re - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[4] - mov 8*3($a_ptr), @acc[5] - - mov 8*3(%rsp), $r_ptr - call __mulq_mont_383_nonred -___ -{ -my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 - 12,13,"ax","bx","bp","si"); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - - mov @acc[0], 8*6($r_ptr) # ret->im - mov @acc[1], 8*7($r_ptr) - mov @acc[2], 8*8($r_ptr) - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32(%rsp), $a_ptr # t0 - lea 32+8*6(%rsp), $b_ptr # t1 - - mov 32+8*6(%rsp), %rax # t1[0] - mov 32+8*0(%rsp), @acc[6] # t0[0..3] - mov 32+8*1(%rsp), @acc[7] - mov 32+8*2(%rsp), @acc[4] - mov 32+8*3(%rsp), @acc[5] - - call __mulq_mont_383_nonred -___ -{ -my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 - 12,13,"ax","bx","bp","si"); -$code.=<<___; - mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im - mov 32+8*0(%rsp), @acc[6] - mov 32+8*1(%rsp), @acc[7] - and @acc[11], @acc[6] - mov 32+8*2(%rsp), @acc[8] - and @acc[11], @acc[7] - mov 32+8*3(%rsp), @acc[9] - and @acc[11], @acc[8] - mov 32+8*4(%rsp), @acc[10] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 32+8*5(%rsp), @acc[11] - - sub @acc[6], @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb @acc[7], @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb @acc[8], @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb @acc[9], @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb @acc[10], @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb @acc[11], @acc[5] - sbb @acc[11], @acc[11] - - and @acc[11], @acc[6] - and @acc[11], @acc[7] - and @acc[11], @acc[8] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 8*5($n_ptr), 
@acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - adc @acc[8], @acc[2] - adc @acc[9], @acc[3] - adc @acc[10], @acc[4] - adc @acc[11], @acc[5] - - mov @acc[0], 8*0($r_ptr) # ret->re - mov @acc[1], 8*1($r_ptr) - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) -___ -} -$code.=<<___; - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqr_mont_382x,.-sqr_mont_382x -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/mulx_mont_256-x86_64.pl b/crypto/blst_src/asm/mulx_mont_256-x86_64.pl deleted file mode 100755 index 0d6bf2e465c..00000000000 --- a/crypto/blst_src/asm/mulx_mont_256-x86_64.pl +++ /dev/null @@ -1,486 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# "Sparse" in subroutine names refers to most significant limb of the -# modulus. Though "sparse" is a bit of misnomer, because limitation is -# just not-all-ones. Or in other words not larger than 2^256-2^192-1. -# In general Montgomery multiplication algorithm can handle one of the -# inputs being non-reduced and capped by 1<re, b->re); - #lea 0($b_btr), $b_ptr # b->re - #lea 0($a_ptr), $a_ptr # a->re - lea 40(%rsp), $r_ptr # t0 - call __mulx_384 - - ################################# mul_384(t1, a->im, b->im); - lea 48($b_ptr), $b_ptr # b->im - lea 128+48($a_ptr), $a_ptr # a->im - lea 96($r_ptr), $r_ptr # t1 - call __mulx_384 - - ################################# mul_384(t2, a->re+a->im, b->re+b->im); - mov 8*1(%rsp), $n_ptr - lea ($b_ptr), $a_ptr # b->re - lea -48($b_ptr), $b_org # b->im - lea 40+192+48(%rsp), $r_ptr - call __add_mod_384 - - mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_org # a->im - lea -48($r_ptr), $r_ptr - call __add_mod_384 - - lea ($r_ptr),$b_ptr - lea 48($r_ptr),$a_ptr - call __mulx_384 - - ################################# t2=t2-t0-t1 - lea ($r_ptr), $a_ptr # t2 - lea 40(%rsp), $b_org # t0 - mov 8*1(%rsp), $n_ptr - call __sub_mod_384x384 # t2-t0 - - lea ($r_ptr), $a_ptr # t2 - lea -96($r_ptr), $b_org # t1 - call __sub_mod_384x384 # t2-t0-t1 - - ################################# t0=t0-t1 - lea 40(%rsp), $a_ptr - lea 40+96(%rsp), $b_org - lea 40(%rsp), $r_ptr - call __sub_mod_384x384 # t0-t1 - - lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 - - ################################# redc_mont_384(ret->re, t0, mod, n0); - lea 40(%rsp), $a_ptr # t0 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - mov 8*4(%rsp), $r_ptr # ret->re - call __mulx_by_1_mont_384 - call __redc_tail_mont_384 - - ################################# redc_mont_384(ret->im, t2, mod, n0); - lea 40+192(%rsp), $a_ptr # t2 - mov 8*0(%rsp), %rcx # n0 for redc_mont_384 - lea 48($r_ptr), $r_ptr # ret->im - call __mulx_by_1_mont_384 - call __redc_tail_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret 
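mulx_mont_384x above is Karatsuba over the quadratic extension Fp[u]/(u^2 + 1): three double-width products (a0*b0, a1*b1 and (a0+a1)(b0+b1)), double-width subtractions to form a0*b0 - a1*b1 and a0*b1 + a1*b0, and only then two Montgomery reductions. The same flow restated as a C sketch, with helper names mirroring the comments in the hunk; the exact prototypes are assumptions.

    #include <stdint.h>

    typedef uint64_t vec384[6];     /* one Fp element            */
    typedef uint64_t vec768[12];    /* one double-width product  */

    /* Assumed helper prototypes, matching the comments in the hunk. */
    void mul_384(vec768 ret, const vec384 a, const vec384 b);
    void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p);
    void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, const vec384 p);
    void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, uint64_t n0);

    /* ret = a * b in Fp2 = Fp[u]/(u^2 + 1), with a = a[0] + a[1]*u. */
    static void mul_mont_384x_sketch(vec384 ret[2], const vec384 a[2],
                                     const vec384 b[2], const vec384 p,
                                     uint64_t n0)
    {
        vec768 t0, t1, t2;
        vec384 aa, bb;

        mul_384(t0, a[0], b[0]);            /* t0 = a0*b0                   */
        mul_384(t1, a[1], b[1]);            /* t1 = a1*b1                   */
        add_mod_384(aa, a[0], a[1], p);     /* aa = a0 + a1                 */
        add_mod_384(bb, b[0], b[1], p);     /* bb = b0 + b1                 */
        mul_384(t2, aa, bb);                /* t2 = (a0+a1)*(b0+b1)         */

        sub_mod_384x384(t2, t2, t0, p);     /* t2 -= t0                     */
        sub_mod_384x384(t2, t2, t1, p);     /* t2 -= t1  -> a0*b1 + a1*b0   */
        sub_mod_384x384(t0, t0, t1, p);     /* t0 -= t1  -> a0*b0 - a1*b1   */

        redc_mont_384(ret[0], t0, p, n0);   /* ret->re                      */
        redc_mont_384(ret[1], t2, p, n0);   /* ret->im                      */
    }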
-.cfi_endproc -.size mulx_mont_384x,.-mulx_mont_384x -___ -} -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # alignment -$code.=<<___; -.globl sqrx_mont_384x -.hidden sqrx_mont_384x -.type sqrx_mont_384x,\@function,4,"unwind" -.align 32 -sqrx_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - # gap for __mulx_mont_384 - mov $r_ptr, 8*2(%rsp) - mov $a_ptr, 8*3(%rsp) - - ################################# add_mod_384(t0, a->re, a->im); - lea 48($a_ptr), $b_org # a->im - lea 32(%rsp), $r_ptr # t0 - call __add_mod_384 - - ################################# sub_mod_384(t1, a->re, a->im); - mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_org # a->im - lea 32+48(%rsp), $r_ptr # t1 - call __sub_mod_384 - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rdx - mov 8*0($a_ptr), %r14 # @acc[6] - mov 8*1($a_ptr), %r15 # @acc[7] - mov 8*2($a_ptr), %rax # @acc[8] - mov 8*3($a_ptr), %r12 # @acc[4] - mov 8*4($a_ptr), %rdi # $lo - mov 8*5($a_ptr), %rbp # $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_384 -___ -{ -my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 - 8..11,13,14); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - mov @acc[0], @acc[6] - adc @acc[3], @acc[3] - mov @acc[1], @acc[7] - adc @acc[4], @acc[4] - mov @acc[2], @acc[8] - adc @acc[5], @acc[5] - mov @acc[3], @acc[9] - sbb $a_ptr, $a_ptr - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[10] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], @acc[11] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $a_ptr - - cmovc @acc[6], @acc[0] - cmovc @acc[7], @acc[1] - cmovc @acc[8], @acc[2] - mov @acc[0], 8*6($b_ptr) # ret->im - cmovc @acc[9], @acc[3] - mov @acc[1], 8*7($b_ptr) - cmovc @acc[10], @acc[4] - mov @acc[2], 8*8($b_ptr) - cmovc @acc[11], @acc[5] - mov @acc[3], 8*9($b_ptr) - mov @acc[4], 8*10($b_ptr) - mov @acc[5], 8*11($b_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32(%rsp), $a_ptr # t0 - lea 32+48(%rsp), $b_ptr # t1 - - mov 32+48(%rsp), %rdx # t1[0] - mov 32+8*0(%rsp), %r14 # @acc[6] - mov 32+8*1(%rsp), %r15 # @acc[7] - mov 32+8*2(%rsp), %rax # @acc[8] - mov 32+8*3(%rsp), %r12 # @acc[4] - mov 32+8*4(%rsp), %rdi # $lo - mov 32+8*5(%rsp), %rbp # $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_mont_384x,.-sqrx_mont_384x - -.globl mulx_382x -.hidden mulx_382x -.type mulx_382x,\@function,4,"unwind" -.align 32 
-mulx_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - lea 96($r_ptr), $r_ptr # ret->im - mov $a_ptr, 8*0(%rsp) - mov $b_org, 8*1(%rsp) - mov $r_ptr, 8*2(%rsp) # offload ret->im - mov $n_ptr, 8*3(%rsp) - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[0] - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - add 8*6($a_ptr), @acc[0] - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - - mov @acc[0], 32+8*0(%rsp) - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - ################################# t1 = b->re + b->im - mov 8*0($b_org), @acc[0] - mov 8*1($b_org), @acc[1] - mov 8*2($b_org), @acc[2] - mov 8*3($b_org), @acc[3] - mov 8*4($b_org), @acc[4] - mov 8*5($b_org), @acc[5] - - add 8*6($b_org), @acc[0] - adc 8*7($b_org), @acc[1] - adc 8*8($b_org), @acc[2] - adc 8*9($b_org), @acc[3] - adc 8*10($b_org), @acc[4] - adc 8*11($b_org), @acc[5] - - mov @acc[0], 32+8*6(%rsp) - mov @acc[1], 32+8*7(%rsp) - mov @acc[2], 32+8*8(%rsp) - mov @acc[3], 32+8*9(%rsp) - mov @acc[4], 32+8*10(%rsp) - mov @acc[5], 32+8*11(%rsp) - - ################################# mul_384(ret->im, t0, t1); - lea 32+8*0(%rsp), $a_ptr # t0 - lea 32+8*6(%rsp), $b_ptr # t1 - call __mulx_384 - - ################################# mul_384(ret->re, a->re, b->re); - mov 8*0(%rsp), $a_ptr - mov 8*1(%rsp), $b_ptr - lea -96($r_ptr), $r_ptr # ret->re - call __mulx_384 - - ################################# mul_384(tx, a->im, b->im); - lea 48+128($a_ptr), $a_ptr - lea 48($b_ptr), $b_ptr - lea 32(%rsp), $r_ptr - call __mulx_384 - - ################################# ret->im -= tx - mov 8*2(%rsp), $a_ptr # restore ret->im - lea 32(%rsp), $b_org - mov 8*3(%rsp), $n_ptr - mov $a_ptr, $r_ptr - call __sub_mod_384x384 - - ################################# ret->im -= ret->re - lea 0($r_ptr), $a_ptr - lea -96($r_ptr), $b_org - call __sub_mod_384x384 - - ################################# ret->re -= tx - lea -96($r_ptr), $a_ptr - lea 32(%rsp), $b_org - lea -96($r_ptr), $r_ptr - call __sub_mod_384x384 - - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size mulx_382x,.-mulx_382x -___ -} -{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected - # except for $n_ptr and $r_ptr -$code.=<<___; -.globl sqrx_382x -.hidden sqrx_382x -.type sqrx_382x,\@function,3,"unwind" -.align 32 -sqrx_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $a_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - - ################################# t0 = a->re + a->im - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 
8*4($a_ptr), @acc[10] - mov 8*5($a_ptr), @acc[11] - - mov @acc[6], @acc[0] - add 8*6($a_ptr), @acc[6] - mov @acc[7], @acc[1] - adc 8*7($a_ptr), @acc[7] - mov @acc[8], @acc[2] - adc 8*8($a_ptr), @acc[8] - mov @acc[9], @acc[3] - adc 8*9($a_ptr), @acc[9] - mov @acc[10], @acc[4] - adc 8*10($a_ptr), @acc[10] - mov @acc[11], @acc[5] - adc 8*11($a_ptr), @acc[11] - - mov @acc[6], 8*0($r_ptr) - mov @acc[7], 8*1($r_ptr) - mov @acc[8], 8*2($r_ptr) - mov @acc[9], 8*3($r_ptr) - mov @acc[10], 8*4($r_ptr) - mov @acc[11], 8*5($r_ptr) - - ################################# t1 = a->re - a->im - lea 48($a_ptr), $b_org - lea 48($r_ptr), $r_ptr - call __sub_mod_384_a_is_loaded - - ################################# mul_384(ret->re, t0, t1); - lea ($r_ptr), $a_ptr - lea -48($r_ptr), $b_ptr - lea -48($r_ptr), $r_ptr - call __mulx_384 - - ################################# mul_384(ret->im, a->re, a->im); - mov (%rsp), $a_ptr - lea 48($a_ptr), $b_ptr - lea 96($r_ptr), $r_ptr - call __mulx_384 - - mov 8*0($r_ptr), @acc[0] # double ret->im - mov 8*1($r_ptr), @acc[1] - mov 8*2($r_ptr), @acc[2] - mov 8*3($r_ptr), @acc[3] - mov 8*4($r_ptr), @acc[4] - mov 8*5($r_ptr), @acc[5] - mov 8*6($r_ptr), @acc[6] - mov 8*7($r_ptr), @acc[7] - mov 8*8($r_ptr), @acc[8] - mov 8*9($r_ptr), @acc[9] - mov 8*10($r_ptr), @acc[10] - add @acc[0], @acc[0] - mov 8*11($r_ptr), @acc[11] - adc @acc[1], @acc[1] - mov @acc[0], 8*0($r_ptr) - adc @acc[2], @acc[2] - mov @acc[1], 8*1($r_ptr) - adc @acc[3], @acc[3] - mov @acc[2], 8*2($r_ptr) - adc @acc[4], @acc[4] - mov @acc[3], 8*3($r_ptr) - adc @acc[5], @acc[5] - mov @acc[4], 8*4($r_ptr) - adc @acc[6], @acc[6] - mov @acc[5], 8*5($r_ptr) - adc @acc[7], @acc[7] - mov @acc[6], 8*6($r_ptr) - adc @acc[8], @acc[8] - mov @acc[7], 8*7($r_ptr) - adc @acc[9], @acc[9] - mov @acc[8], 8*8($r_ptr) - adc @acc[10], @acc[10] - mov @acc[9], 8*9($r_ptr) - adc @acc[11], @acc[11] - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - mov 8*1(%rsp),%r15 -.cfi_restore %r15 - mov 8*2(%rsp),%r14 -.cfi_restore %r14 - mov 8*3(%rsp),%r13 -.cfi_restore %r13 - mov 8*4(%rsp),%r12 -.cfi_restore %r12 - mov 8*5(%rsp),%rbx -.cfi_restore %rbx - mov 8*6(%rsp),%rbp -.cfi_restore %rbp - lea 8*7(%rsp),%rsp -.cfi_adjust_cfa_offset -8*7 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_382x,.-sqrx_382x -___ -} -{ ########################################################## 384-bit mulx -my ($a0, $a1) = @acc[6..7]; -my @acc = @acc[0..5]; -my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); - -$code.=<<___; -.globl mulx_384 -.hidden mulx_384 -.type mulx_384,\@function,3,"unwind" -.align 32 -mulx_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.cfi_end_prologue - - mov $b_org, $b_ptr # evacuate from %rdx - call __mulx_384 - - mov 0(%rsp),%r15 -.cfi_restore %r15 - mov 8(%rsp),%r14 -.cfi_restore %r14 - mov 16(%rsp),%r13 -.cfi_restore %r13 - mov 24(%rsp),%r12 -.cfi_restore %r12 - mov 32(%rsp),%rbx -.cfi_restore %rbx - mov 40(%rsp),%rbp -.cfi_restore %rbp - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.cfi_epilogue - ret -.cfi_endproc -.size mulx_384,.-mulx_384 - -.type __mulx_384,\@abi-omnipotent -.align 32 -__mulx_384: - mov 8*0($b_ptr), %rdx - mov 8*0($a_ptr), $a0 - mov 8*1($a_ptr), $a1 - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - lea -128($a_ptr), $a_ptr - - mulx $a0, @acc[1], $hi - xor $zr, $zr - - mulx $a1, @acc[0], $lo - adcx $hi, @acc[0] - 
mov @acc[1], 8*0($r_ptr) - - mulx @acc[2], @acc[1], $hi - adcx $lo, @acc[1] - - mulx @acc[3], @acc[2], $lo - adcx $hi, @acc[2] - - mulx @acc[4], @acc[3], $hi - adcx $lo, @acc[3] - - mulx @acc[5], @acc[4], @acc[5] - mov 8*1($b_ptr), %rdx - adcx $hi, @acc[4] - adcx $zr, @acc[5] -___ -for(my $i=1; $i<6; $i++) { -my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; -$code.=<<___; - mulx $a0, $lo, $hi - adcx @acc[0], $lo - adox $hi, @acc[1] - mov $lo, 8*$i($r_ptr) - - mulx $a1, @acc[0], $hi - adcx @acc[1], $acc[0] - adox $hi, @acc[2] - - mulx 128+8*2($a_ptr), @acc[1], $lo - adcx @acc[2], @acc[1] - adox $lo, @acc[3] - - mulx 128+8*3($a_ptr), @acc[2], $hi - adcx @acc[3], @acc[2] - adox $hi, @acc[4] - - mulx 128+8*4($a_ptr), @acc[3], $lo - adcx @acc[4], @acc[3] - adox @acc[5], $lo - - mulx 128+8*5($a_ptr), @acc[4], @acc[5] - mov $b_next, %rdx - adcx $lo, @acc[4] - adox $zr, @acc[5] - adcx $zr, @acc[5] -___ -} -$code.=<<___; - mov @acc[0], 8*6($r_ptr) - mov @acc[1], 8*7($r_ptr) - mov @acc[2], 8*8($r_ptr) - mov @acc[3], 8*9($r_ptr) - mov @acc[4], 8*10($r_ptr) - mov @acc[5], 8*11($r_ptr) - - ret -.size __mulx_384,.-__mulx_384 -___ -} -{ ########################################################## 384-bit sqrx -$code.=<<___; -.globl sqrx_384 -.hidden sqrx_384 -.type sqrx_384,\@function,2,"unwind" -.align 32 -sqrx_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $r_ptr -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - call __sqrx_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_384,.-sqrx_384 -___ -if (0) { -# up to 5% slower than below variant -my @acc=map("%r$_",("no",8..15,"cx","bx")); - push(@acc, $a_ptr); -my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); - -$code.=<<___; -.type __sqrx_384,\@abi-omnipotent -.align 32 -__sqrx_384: - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 8*4($a_ptr), @acc[10] - - ######################################### - mulx @acc[7], @acc[1], $lo # a[1]*a[0] - mov 8*5($a_ptr), @acc[11] - mulx @acc[8], @acc[2], $hi # a[2]*a[0] - add $lo, @acc[2] - mulx @acc[9], @acc[3], $lo # a[3]*a[0] - adc $hi, @acc[3] - mulx @acc[10], @acc[4], $hi # a[4]*a[0] - adc $lo, @acc[4] - mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] - adc $hi, @acc[5] - adc \$0, @acc[6] - - mulx %rdx, $lo, $hi # a[0]*a[0] - mov @acc[7], %rdx - xor @acc[7], @acc[7] - add @acc[1], @acc[1] # double acc[1] - adc \$0, @acc[7] - add $hi, @acc[1] - adc \$0, @acc[7] - mov $lo, 8*0($r_ptr) - mov @acc[1], 8*1($r_ptr) -___ -($carry, @acc[7]) = (@acc[7], @acc[1]); -$code.=<<___; - ######################################### - xor @acc[7], @acc[7] - mulx @acc[8], $lo, $hi # a[2]*a[1] - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx @acc[9], $lo, $hi # a[3]*a[1] - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx @acc[10], $lo, $hi # a[4]*a[1] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[11], $lo, $hi # a[5]*a[1] - adcx $lo, @acc[6] - adox @acc[7], $hi - adcx $hi, @acc[7] - - mulx %rdx, $lo, $hi # a[1]*a[1] - mov @acc[8], %rdx - xor @acc[8], @acc[8] - adox @acc[2], @acc[2] # double acc[2:3] - adcx $carry, $lo # can't carry - adox @acc[3], 
@acc[3] - adcx $lo, @acc[2] - adox @acc[8], @acc[8] - adcx $hi, @acc[3] - adc \$0, @acc[8] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) -___ -($carry,@acc[8])=(@acc[8],$carry); -$code.=<<___; - ######################################### - xor @acc[8], @acc[8] - mulx @acc[9], $lo, $hi # a[3]*a[2] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[10], $lo, $hi # a[4]*a[2] - adcx $lo, @acc[6] - adox $hi, @acc[7] - - mulx @acc[11], $lo, $hi # a[5]*a[2] - adcx $lo, @acc[7] - adox @acc[8], $hi - adcx $hi, @acc[8] - - mulx %rdx, $lo, $hi # a[2]*a[2] - mov @acc[9], %rdx - xor @acc[9], @acc[9] - adox @acc[4], @acc[4] # double acc[4:5] - adcx $carry, $lo # can't carry - adox @acc[5], @acc[5] - adcx $lo, @acc[4] - adox @acc[9], @acc[9] - adcx $hi, @acc[5] - adc \$0, $acc[9] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) -___ -($carry,@acc[9])=(@acc[9],$carry); -$code.=<<___; - ######################################### - xor @acc[9], @acc[9] - mulx @acc[10], $lo, $hi # a[4]*a[3] - adcx $lo, @acc[7] - adox $hi, @acc[8] - - mulx @acc[11], $lo, $hi # a[5]*a[3] - adcx $lo, @acc[8] - adox @acc[9], $hi - adcx $hi, @acc[9] - - mulx %rdx, $lo, $hi - mov @acc[10], %rdx - xor @acc[10], @acc[10] - adox @acc[6], @acc[6] # double acc[6:7] - adcx $carry, $lo # can't carry - adox @acc[7], @acc[7] - adcx $lo, @acc[6] - adox @acc[10], @acc[10] - adcx $hi, @acc[7] - adc \$0, $acc[10] - mov @acc[6], 8*6($r_ptr) - mov @acc[7], 8*7($r_ptr) -___ -($carry,@acc[10])=(@acc[10],$carry); -$code.=<<___; - ######################################### - mulx @acc[11], $lo, @acc[10] # a[5]*a[4] - add $lo, @acc[9] - adc \$0, @acc[10] - - mulx %rdx, $lo, $hi # a[4]*a[4] - mov @acc[11], %rdx - xor @acc[11], @acc[11] - adox @acc[8], @acc[8] # double acc[8:10] - adcx $carry, $lo # can't carry - adox @acc[9], @acc[9] - adcx $lo, @acc[8] - adox @acc[10], @acc[10] - adcx $hi, @acc[9] - adox @acc[11], @acc[11] - mov @acc[8], 8*8($r_ptr) - mov @acc[9], 8*9($r_ptr) - - ######################################### - mulx %rdx, $lo, $hi # a[5]*a[5] - adcx $lo, @acc[10] - adcx $hi, @acc[11] - - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sqrx_384,.-__sqrx_384 -___ -} else { -my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); -my ($lo, $hi)=($r_ptr, "%rax"); - -$code.=<<___; -.type __sqrx_384,\@abi-omnipotent -.align 32 -__sqrx_384: - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[9] - mov 8*4($a_ptr), @acc[10] - - ######################################### - mulx @acc[7], @acc[1], $lo # a[1]*a[0] - mov 8*5($a_ptr), @acc[11] - mulx @acc[8], @acc[2], $hi # a[2]*a[0] - add $lo, @acc[2] - mulx @acc[9], @acc[3], $lo # a[3]*a[0] - adc $hi, @acc[3] - mulx @acc[10], @acc[4], $hi # a[4]*a[0] - adc $lo, @acc[4] - mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] - mov @acc[7], %rdx - adc $hi, @acc[5] - adc \$0, @acc[6] - - ######################################### - xor @acc[7], @acc[7] - mulx @acc[8], $lo, $hi # a[2]*a[1] - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx @acc[9], $lo, $hi # a[3]*a[1] - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx @acc[10], $lo, $hi # a[4]*a[1] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[11], $lo, $hi # a[5]*a[1] - mov @acc[8], %rdx - adcx $lo, @acc[6] - adox @acc[7], $hi - adcx $hi, @acc[7] - - ######################################### - xor @acc[8], @acc[8] - mulx @acc[9], $lo, $hi # a[3]*a[2] - adcx $lo, @acc[5] - adox $hi, @acc[6] - - mulx @acc[10], $lo, $hi # a[4]*a[2] - adcx $lo, @acc[6] - adox $hi, @acc[7] - - mulx 
@acc[11], $lo, $hi # a[5]*a[2] - mov @acc[9], %rdx - adcx $lo, @acc[7] - adox @acc[8], $hi - adcx $hi, @acc[8] - - ######################################### - xor @acc[9], @acc[9] - mulx @acc[10], $lo, $hi # a[4]*a[3] - adcx $lo, @acc[7] - adox $hi, @acc[8] - - mulx @acc[11], $lo, $hi # a[5]*a[3] - mov @acc[10], %rdx - adcx $lo, @acc[8] - adox @acc[9], $hi - adcx $hi, @acc[9] - - ######################################### - mulx @acc[11], $lo, @acc[10] # a[5]*a[4] - mov 8*0($a_ptr), %rdx - add $lo, @acc[9] - mov 8(%rsp), $r_ptr # restore $r_ptr - adc \$0, @acc[10] - - ######################################### double acc[1:10] - xor @acc[11], @acc[11] - adcx @acc[1], @acc[1] - adcx @acc[2], @acc[2] - adcx @acc[3], @acc[3] - adcx @acc[4], @acc[4] - adcx @acc[5], @acc[5] - - ######################################### accumulate a[i]*a[i] - mulx %rdx, %rdx, $hi # a[0]*a[0] - mov %rdx, 8*0($r_ptr) - mov 8*1($a_ptr), %rdx - adox $hi, @acc[1] - mov @acc[1], 8*1($r_ptr) - - mulx %rdx, @acc[1], $hi # a[1]*a[1] - mov 8*2($a_ptr), %rdx - adox @acc[1], @acc[2] - adox $hi, @acc[3] - mov @acc[2], 8*2($r_ptr) - mov @acc[3], 8*3($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] - mov 8*3($a_ptr), %rdx - adox @acc[1], @acc[4] - adox @acc[2], @acc[5] - adcx @acc[6], @acc[6] - adcx @acc[7], @acc[7] - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] - mov 8*4($a_ptr), %rdx - adox @acc[1], @acc[6] - adox @acc[2], @acc[7] - adcx @acc[8], @acc[8] - adcx @acc[9], @acc[9] - mov @acc[6], 8*6($r_ptr) - mov @acc[7], 8*7($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] - mov 8*5($a_ptr), %rdx - adox @acc[1], @acc[8] - adox @acc[2], @acc[9] - adcx @acc[10], @acc[10] - adcx @acc[11], @acc[11] - mov @acc[8], 8*8($r_ptr) - mov @acc[9], 8*9($r_ptr) - - mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] - adox @acc[1], @acc[10] - adox @acc[2], @acc[11] - - mov @acc[10], 8*10($r_ptr) - mov @acc[11], 8*11($r_ptr) - - ret -.size __sqrx_384,.-__sqrx_384 -___ -} - -{ ########################################################## 384-bit redcx_mont -my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" -my ($lo, $hi) = ("%rax", "%rbp"); - -$code.=<<___; -######################################################################## -# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], -# uint64_t m[6], uint64_t n0); -.globl redcx_mont_384 -.hidden redcx_mont_384 -.type redcx_mont_384,\@function,4,"unwind" -.align 32 -redcx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulx_by_1_mont_384 - call __redc_tail_mont_384 - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size redcx_mont_384,.-redcx_mont_384 - -######################################################################## -# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], -# uint64_t m[6], uint64_t n0); -.globl fromx_mont_384 -.hidden fromx_mont_384 -.type fromx_mont_384,\@function,4,"unwind" -.align 32 -fromx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - 
push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $b_org, $n_ptr - call __mulx_by_1_mont_384 - - ################################# - # Branch-less conditional acc[0:6] - modulus - - mov @acc[6], %rax - mov @acc[7], %rcx - mov @acc[0], %rdx - mov @acc[1], %rbp - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[7] - mov @acc[2], @acc[5] - sbb 8*2($n_ptr), @acc[0] - sbb 8*3($n_ptr), @acc[1] - sbb 8*4($n_ptr), @acc[2] - mov @acc[3], $a_ptr - sbb 8*5($n_ptr), @acc[3] - - cmovc %rax, @acc[6] - cmovc %rcx, @acc[7] - cmovc %rdx, @acc[0] - mov @acc[6], 8*0($r_ptr) - cmovc %rbp, @acc[1] - mov @acc[7], 8*1($r_ptr) - cmovc @acc[5], @acc[2] - mov @acc[0], 8*2($r_ptr) - cmovc $a_ptr, @acc[3] - mov @acc[1], 8*3($r_ptr) - mov @acc[2], 8*4($r_ptr) - mov @acc[3], 8*5($r_ptr) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size fromx_mont_384,.-fromx_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulx_by_1_mont_384,\@abi-omnipotent -.align 32 -__mulx_by_1_mont_384: - mov 8*0($a_ptr), @acc[0] - mov $n0, %rdx - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] -___ -for (my $i=0; $i<6; $i++) { -$code.=<<___; - imulq @acc[0], %rdx - - ################################# reduction $i - xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 - mulx 8*0($n_ptr), $lo, $hi - adcx $lo, @acc[0] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5($n_ptr), $lo, $hi - mov $n0, %rdx - adcx $lo, @acc[5] - adox @acc[6], $hi - adcx $hi, @acc[6] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - ret -.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 - -.type __redc_tail_mont_384,\@abi-omnipotent -.align 32 -__redc_tail_mont_384: - add 8*6($a_ptr), @acc[0] # accumulate upper half - mov @acc[0], %rax - adc 8*7($a_ptr), @acc[1] - adc 8*8($a_ptr), @acc[2] - adc 8*9($a_ptr), @acc[3] - mov @acc[1], %rcx - adc 8*10($a_ptr), @acc[4] - adc 8*11($a_ptr), @acc[5] - sbb @acc[6], @acc[6] - - ################################# - # Branch-less conditional acc[0:6] - modulus - - mov @acc[2], %rdx - mov @acc[3], %rbp - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - mov @acc[4], @acc[7] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - mov @acc[5], $a_ptr - sbb 8*5($n_ptr), @acc[5] - sbb \$0, @acc[6] - - cmovc %rax, @acc[0] - cmovc %rcx, @acc[1] - cmovc %rdx, @acc[2] - mov @acc[0], 8*0($r_ptr) - cmovc %rbp, @acc[3] - mov @acc[1], 8*1($r_ptr) - cmovc @acc[7], @acc[4] - mov @acc[2], 8*2($r_ptr) - cmovc $a_ptr, @acc[5] - mov @acc[3], 8*3($r_ptr) - mov @acc[4], 8*4($r_ptr) - mov @acc[5], 8*5($r_ptr) - - ret -.size __redc_tail_mont_384,.-__redc_tail_mont_384 - -.globl sgn0x_pty_mont_384 -.hidden sgn0x_pty_mont_384 -.type sgn0x_pty_mont_384,\@function,3,"unwind" -.align 32 -sgn0x_pty_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx 
-.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 0($r_ptr), $a_ptr - mov $b_org, $n0 - call __mulx_by_1_mont_384 - - xor %rax, %rax - mov @acc[0], @acc[7] - add @acc[0], @acc[0] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[0] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - not %rax # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 - -.globl sgn0x_pty_mont_384x -.hidden sgn0x_pty_mont_384x -.type sgn0x_pty_mont_384x,\@function,3,"unwind" -.align 32 -sgn0x_pty_mont_384x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$8, %rsp -.cfi_adjust_cfa_offset 8 -.cfi_end_prologue - - mov $a_ptr, $n_ptr - lea 48($r_ptr), $a_ptr # sgn0(a->im) - mov $b_org, $n0 - call __mulx_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - lea 0($r_ptr), $a_ptr # sgn0(a->re) - xor $r_ptr, $r_ptr - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, $r_ptr - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, $r_ptr - - mov @acc[0], 0(%rsp) # a->im is zero or not - not $r_ptr # 2*x > p, which means "negative" - and \$1, @acc[7] - and \$2, $r_ptr - or @acc[7], $r_ptr # pack sign and parity - - call __mulx_by_1_mont_384 - - mov @acc[0], @acc[6] - or @acc[1], @acc[0] - or @acc[2], @acc[0] - or @acc[3], @acc[0] - or @acc[4], @acc[0] - or @acc[5], @acc[0] - - xor %rax, %rax - mov @acc[6], @acc[7] - add @acc[6], @acc[6] - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - adc \$0, %rax - - sub 8*0($n_ptr), @acc[6] - sbb 8*1($n_ptr), @acc[1] - sbb 8*2($n_ptr), @acc[2] - sbb 8*3($n_ptr), @acc[3] - sbb 8*4($n_ptr), @acc[4] - sbb 8*5($n_ptr), @acc[5] - sbb \$0, %rax - - mov 0(%rsp), @acc[6] - - not %rax # 2*x > p, which means "negative" - - test @acc[0], @acc[0] - cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) - - test @acc[6], @acc[6] - cmovnz $r_ptr, %rax # a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and \$1, @acc[7] - and \$2, %rax - or @acc[7], %rax # pack sign and parity - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - mov 48(%rsp),%rbp -.cfi_restore %rbp - lea 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 -.cfi_epilogue - ret -.cfi_endproc -.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x -___ -} } - -{ ########################################################## mulx/sqrx_mont -my @acc = (@acc, "%rax"); -my ($lo,$hi)=("%rdi","%rbp"); - -$code.=<<___; -.globl mulx_mont_384 -.hidden mulx_mont_384 -.type mulx_mont_384,\@function,5,"unwind" -.align 32 -mulx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*3(%rsp), %rsp -.cfi_adjust_cfa_offset 8*3 -.cfi_end_prologue - - mov $b_org, $b_ptr # evacuate from %rdx - mov 8*0($b_org), %rdx - mov 8*0($a_ptr), @acc[6] - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) - mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - mov $n0, (%rsp) - - mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] - call __mulx_mont_384 - - mov 8*3(%rsp),%r15 -.cfi_restore %r15 - mov 8*4(%rsp),%r14 -.cfi_restore %r14 - mov 8*5(%rsp),%r13 -.cfi_restore %r13 - mov 8*6(%rsp),%r12 -.cfi_restore %r12 - mov 8*7(%rsp),%rbx -.cfi_restore %rbx - mov 8*8(%rsp),%rbp -.cfi_restore %rbp - lea 8*9(%rsp),%rsp -.cfi_adjust_cfa_offset -8*9 -.cfi_epilogue - ret -.cfi_endproc -.size mulx_mont_384,.-mulx_mont_384 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulx_mont_384,\@abi-omnipotent -.align 32 -__mulx_mont_384: -.cfi_startproc - mulx @acc[7], @acc[6], @acc[2] - mulx @acc[8], @acc[7], @acc[3] - add @acc[6], @acc[1] - mulx @acc[4], @acc[8], @acc[4] - adc @acc[7], @acc[2] - mulx $lo, $lo, @acc[5] - adc @acc[8], @acc[3] - mulx $hi, $hi, @acc[6] - mov 8($b_ptr), %rdx - adc $lo, @acc[4] - adc $hi, @acc[5] - adc \$0, @acc[6] - xor @acc[7], @acc[7] - -___ -for (my $i=1; $i<6; $i++) { -my $tt = $i==1 ? @acc[7] : $hi; -my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - mov @acc[0], 16(%rsp) - imulq 8(%rsp), @acc[0] - - ################################# Multiply by b[$i] - xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($a_ptr), $lo, $hi - adox $lo, @acc[1] - adcx $hi, @acc[2] - - mulx 8*1+128($a_ptr), $lo, $hi - adox $lo, @acc[2] - adcx $hi, @acc[3] - - mulx 8*2+128($a_ptr), $lo, $hi - adox $lo, @acc[3] - adcx $hi, @acc[4] - - mulx 8*3+128($a_ptr), $lo, $hi - adox $lo, @acc[4] - adcx $hi, @acc[5] - - mulx 8*4+128($a_ptr), $lo, $hi - adox $lo, @acc[5] - adcx $hi, @acc[6] - - mulx 8*5+128($a_ptr), $lo, $hi - mov @acc[0], %rdx - adox $lo, @acc[6] - adcx $hi, @acc[7] # cf=0 - adox @acc[8], @acc[7] - adox @acc[8], @acc[8] - - ################################# reduction - xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx 16(%rsp), $lo # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5+128($n_ptr), $lo, $hi - mov $b_next, %rdx - adcx $lo, @acc[5] - adox $hi, @acc[6] - adcx @acc[0], @acc[6] - adox @acc[0], @acc[7] - adcx @acc[0], @acc[7] - adox @acc[0], @acc[8] - adcx @acc[0], @acc[8] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - imulq 8(%rsp), %rdx - mov 8*3(%rsp), $b_ptr # restore $r_ptr - - ################################# last reduction - xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx $lo, @acc[0] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - mov @acc[2], @acc[0] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - mov @acc[3], $a_ptr - - mulx 8*5+128($n_ptr), $lo, $hi - adcx $lo, @acc[5] - adox $hi, @acc[6] - mov @acc[1], %rdx - adcx @acc[8], @acc[6] - adox @acc[8], @acc[7] - lea 128($n_ptr), $n_ptr - mov @acc[4], @acc[8] - adc \$0, @acc[7] - - ################################# - # Branch-less conditional acc[1:7] - modulus - - sub 8*0($n_ptr), @acc[1] - sbb 8*1($n_ptr), @acc[2] - mov @acc[5], $lo - sbb 8*2($n_ptr), @acc[3] - sbb 8*3($n_ptr), @acc[4] - sbb 8*4($n_ptr), @acc[5] - mov @acc[6], $hi - sbb 8*5($n_ptr), @acc[6] - sbb \$0, @acc[7] - - cmovnc @acc[1], %rdx - cmovc @acc[0], @acc[2] - cmovc $a_ptr, @acc[3] - cmovnc @acc[4], @acc[8] - mov %rdx, 8*0($b_ptr) - cmovnc @acc[5], $lo - mov @acc[2], 8*1($b_ptr) - cmovnc @acc[6], $hi - mov @acc[3], 8*2($b_ptr) - mov @acc[8], 8*3($b_ptr) - mov $lo, 8*4($b_ptr) - mov $hi, 8*5($b_ptr) - - ret -.cfi_endproc -.size __mulx_mont_384,.-__mulx_mont_384 -___ -} -$code.=<<___; -.globl sqrx_mont_384 -.hidden sqrx_mont_384 -.type sqrx_mont_384,\@function,4,"unwind" -.align 32 -sqrx_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*3(%rsp), %rsp -.cfi_adjust_cfa_offset 8*3 -.cfi_end_prologue - - mov $n_ptr, $n0 # n0 - lea -128($b_org), $n_ptr # control u-op density - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) - 
mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - - lea ($a_ptr), $b_ptr - mov $n0, (%rsp) # n0 - lea -128($a_ptr), $a_ptr # control u-op density - - mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] - call __mulx_mont_384 # as fast as dedicated squaring - - mov 8*3(%rsp),%r15 -.cfi_restore %r15 - mov 8*4(%rsp),%r14 -.cfi_restore %r14 - mov 8*5(%rsp),%r13 -.cfi_restore %r13 - mov 8*6(%rsp),%r12 -.cfi_restore %r12 - mov 8*7(%rsp),%rbx -.cfi_restore %rbx - mov 8*8(%rsp),%rbp -.cfi_restore %rbp - lea 8*9(%rsp),%rsp -.cfi_adjust_cfa_offset -8*9 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_mont_384,.-sqrx_mont_384 - -.globl sqrx_n_mul_mont_384 -.hidden sqrx_n_mul_mont_384 -.type sqrx_n_mul_mont_384,\@function,6,"unwind" -.align 32 -sqrx_n_mul_mont_384: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*5(%rsp), %rsp -.cfi_adjust_cfa_offset 8*5 -.cfi_end_prologue - - mov $b_org, @acc[2] # loop counter - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov $a_ptr, $b_ptr - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 - mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - - mov $n0, (%rsp) - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq 8*0(%r9), %xmm2 # prefetch b[0] - -.Loop_sqrx_384: - movd @acc[2]d, %xmm1 - lea -128($b_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] - call __mulx_mont_384 - - movd %xmm1, @acc[2]d - dec @acc[2]d - jnz .Loop_sqrx_384 - - mov %rdx, @acc[6] - movq %xmm2, %rdx # b[0] - lea -128($b_ptr), $a_ptr # control u-op density - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - lea -128($n_ptr), $n_ptr # control u-op density - - mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] - call __mulx_mont_384 - - mov 8*5(%rsp),%r15 -.cfi_restore %r15 - mov 8*6(%rsp),%r14 -.cfi_restore %r14 - mov 8*7(%rsp),%r13 -.cfi_restore %r13 - mov 8*8(%rsp),%r12 -.cfi_restore %r12 - mov 8*9(%rsp),%rbx -.cfi_restore %rbx - mov 8*10(%rsp),%rbp -.cfi_restore %rbp - lea 8*11(%rsp),%rsp -.cfi_adjust_cfa_offset -8*11 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 - -.globl sqrx_n_mul_mont_383 -.hidden sqrx_n_mul_mont_383 -.type sqrx_n_mul_mont_383,\@function,6,"unwind" -.align 32 -sqrx_n_mul_mont_383: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - lea -8*5(%rsp), %rsp -.cfi_adjust_cfa_offset 8*5 -.cfi_end_prologue - - mov $b_org, @acc[2] # loop counter - mov 8*0($a_ptr), %rdx - mov 8*1($a_ptr), @acc[7] - mov 8*2($a_ptr), @acc[8] - mov $a_ptr, $b_ptr - mov 8*3($a_ptr), @acc[4] - mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred - mov 8*4($a_ptr), $lo - mov 8*5($a_ptr), $hi - - mov $n0, (%rsp) - mov %r9, 8*3(%rsp) # 6th, multiplicand argument - movq 8*0(%r9), %xmm2 # prefetch b[0] - lea -128($n_ptr), $n_ptr # control u-op density - -.Loop_sqrx_383: - movd @acc[2]d, %xmm1 - lea -128($b_ptr), $a_ptr # control u-op density - - mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] - call __mulx_mont_383_nonred # omitting full reduction gives ~15% - # in addition-chains - movd %xmm1, @acc[2]d - dec @acc[2]d - jnz .Loop_sqrx_383 - - mov %rdx, @acc[6] - movq %xmm2, %rdx # b[0] - lea -128($b_ptr), $a_ptr # control u-op density - mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument - - mulx @acc[6], 
@acc[0], @acc[1] # a[0]*b[0] - call __mulx_mont_384 - - mov 8*5(%rsp),%r15 -.cfi_restore %r15 - mov 8*6(%rsp),%r14 -.cfi_restore %r14 - mov 8*7(%rsp),%r13 -.cfi_restore %r13 - mov 8*8(%rsp),%r12 -.cfi_restore %r12 - mov 8*9(%rsp),%rbx -.cfi_restore %rbx - mov 8*10(%rsp),%rbp -.cfi_restore %rbp - lea 8*11(%rsp),%rsp -.cfi_adjust_cfa_offset -8*11 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 -___ -{ my @acc=@acc; # will be rotated locally - -$code.=<<___; -.type __mulx_mont_383_nonred,\@abi-omnipotent -.align 32 -__mulx_mont_383_nonred: -.cfi_startproc - mulx @acc[7], @acc[6], @acc[2] - mulx @acc[8], @acc[7], @acc[3] - add @acc[6], @acc[1] - mulx @acc[4], @acc[8], @acc[4] - adc @acc[7], @acc[2] - mulx $lo, $lo, @acc[5] - adc @acc[8], @acc[3] - mulx $hi, $hi, @acc[6] - mov 8($b_ptr), %rdx - adc $lo, @acc[4] - adc $hi, @acc[5] - adc \$0, @acc[6] -___ -for (my $i=1; $i<6; $i++) { -my $tt = $i==1 ? @acc[7] : $hi; -my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; -$code.=<<___; - mov @acc[0], @acc[8] - imulq 8(%rsp), @acc[0] - - ################################# Multiply by b[$i] - xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($a_ptr), $lo, $hi - adox $lo, @acc[1] - adcx $hi, @acc[2] - - mulx 8*1+128($a_ptr), $lo, $hi - adox $lo, @acc[2] - adcx $hi, @acc[3] - - mulx 8*2+128($a_ptr), $lo, $hi - adox $lo, @acc[3] - adcx $hi, @acc[4] - - mulx 8*3+128($a_ptr), $lo, $hi - adox $lo, @acc[4] - adcx $hi, @acc[5] - - mulx 8*4+128($a_ptr), $lo, $hi - adox $lo, @acc[5] - adcx $hi, @acc[6] - - mulx 8*5+128($a_ptr), $lo, $hi - mov @acc[0], %rdx - adox $lo, @acc[6] - adcx @acc[7], $hi - adox $hi, @acc[7] - - ################################# reduction - xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx $lo, @acc[8] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5+128($n_ptr), $lo, $hi - mov $b_next, %rdx - adcx $lo, @acc[5] - adox $hi, @acc[6] - adcx @acc[8], @acc[6] - adox @acc[8], @acc[7] - adcx @acc[8], @acc[7] -___ - push(@acc,shift(@acc)); -} -$code.=<<___; - imulq 8(%rsp), %rdx - mov 8*3(%rsp), $b_ptr # restore $r_ptr - - ################################# last reduction - xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 - mulx 8*0+128($n_ptr), $lo, $hi - adcx $lo, @acc[0] # guaranteed to be zero - adox $hi, @acc[1] - - mulx 8*1+128($n_ptr), $lo, $hi - adcx $lo, @acc[1] - adox $hi, @acc[2] - - mulx 8*2+128($n_ptr), $lo, $hi - adcx $lo, @acc[2] - adox $hi, @acc[3] - - mulx 8*3+128($n_ptr), $lo, $hi - adcx $lo, @acc[3] - adox $hi, @acc[4] - - mulx 8*4+128($n_ptr), $lo, $hi - adcx $lo, @acc[4] - adox $hi, @acc[5] - - mulx 8*5+128($n_ptr), $lo, $hi - mov @acc[1], %rdx - adcx $lo, @acc[5] - adox $hi, @acc[6] - adc \$0, @acc[6] - mov @acc[4], @acc[8] - - mov @acc[1], 8*0($b_ptr) - mov @acc[2], 8*1($b_ptr) - mov @acc[3], 8*2($b_ptr) - mov @acc[5], $lo - mov @acc[4], 8*3($b_ptr) - mov @acc[5], 8*4($b_ptr) - mov @acc[6], 8*5($b_ptr) - mov @acc[6], $hi - - ret -.cfi_endproc -.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred -___ -} } } -{ my $frame = 4*8 + # place for argument off-load + - 2*384/8 + # place for 2 384-bit temporary vectors - 8; # align -my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); - -# omitting 3 reductions 
gives ~10% better performance in add-chains -$code.=<<___; -.globl sqrx_mont_382x -.hidden sqrx_mont_382x -.type sqrx_mont_382x,\@function,4,"unwind" -.align 32 -sqrx_mont_382x: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - sub \$$frame, %rsp -.cfi_adjust_cfa_offset $frame -.cfi_end_prologue - - mov $n_ptr, 8*0(%rsp) # n0 - mov $b_org, $n_ptr # n_ptr - mov $r_ptr, 8*2(%rsp) - mov $a_ptr, 8*3(%rsp) - - ################################# - mov 8*0($a_ptr), @acc[0] # a->re - mov 8*1($a_ptr), @acc[1] - mov 8*2($a_ptr), @acc[2] - mov 8*3($a_ptr), @acc[3] - mov 8*4($a_ptr), @acc[4] - mov 8*5($a_ptr), @acc[5] - - mov @acc[0], @acc[6] - add 8*6($a_ptr), @acc[0] # a->re + a->im - mov @acc[1], @acc[7] - adc 8*7($a_ptr), @acc[1] - mov @acc[2], @acc[8] - adc 8*8($a_ptr), @acc[2] - mov @acc[3], @acc[9] - adc 8*9($a_ptr), @acc[3] - mov @acc[4], @acc[10] - adc 8*10($a_ptr), @acc[4] - mov @acc[5], @acc[11] - adc 8*11($a_ptr), @acc[5] - - sub 8*6($a_ptr), @acc[6] # a->re - a->im - sbb 8*7($a_ptr), @acc[7] - sbb 8*8($a_ptr), @acc[8] - sbb 8*9($a_ptr), @acc[9] - sbb 8*10($a_ptr), @acc[10] - sbb 8*11($a_ptr), @acc[11] - sbb $r_ptr, $r_ptr # borrow flag as mask - - mov @acc[0], 32+8*0(%rsp) # t0 - mov @acc[1], 32+8*1(%rsp) - mov @acc[2], 32+8*2(%rsp) - mov @acc[3], 32+8*3(%rsp) - mov @acc[4], 32+8*4(%rsp) - mov @acc[5], 32+8*5(%rsp) - - mov @acc[6], 32+8*6(%rsp) # t1 - mov @acc[7], 32+8*7(%rsp) - mov @acc[8], 32+8*8(%rsp) - mov @acc[9], 32+8*9(%rsp) - mov @acc[10], 32+8*10(%rsp) - mov @acc[11], 32+8*11(%rsp) - mov $r_ptr, 32+8*12(%rsp) - - ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); - #mov 8*3(%rsp), $a_ptr # a->re - lea 48($a_ptr), $b_ptr # a->im - - mov 48($a_ptr), %rdx - mov 8*0($a_ptr), %r14 # @acc[6] - mov 8*1($a_ptr), %r15 # @acc[7] - mov 8*2($a_ptr), %rax # @acc[8] - mov 8*3($a_ptr), %r12 # @acc[4] - mov 8*4($a_ptr), %rdi # $lo - mov 8*5($a_ptr), %rbp # $hi - lea -128($a_ptr), $a_ptr # control u-op density - lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_383_nonred -___ -{ -my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 - 8..11,13,14); -$code.=<<___; - add @acc[0], @acc[0] # add with itself - adc @acc[1], @acc[1] - adc @acc[2], @acc[2] - adc @acc[3], @acc[3] - adc @acc[4], @acc[4] - adc @acc[5], @acc[5] - - mov @acc[0], 8*6($b_ptr) # ret->im - mov @acc[1], 8*7($b_ptr) - mov @acc[2], 8*8($b_ptr) - mov @acc[3], 8*9($b_ptr) - mov @acc[4], 8*10($b_ptr) - mov @acc[5], 8*11($b_ptr) -___ -} -$code.=<<___; - ################################# mul_mont_384(ret->re, t0, t1, mod, n0); - lea 32-128(%rsp), $a_ptr # t0 [+u-op density] - lea 32+8*6(%rsp), $b_ptr # t1 - - mov 32+8*6(%rsp), %rdx # t1[0] - mov 32+8*0(%rsp), %r14 # @acc[6] - mov 32+8*1(%rsp), %r15 # @acc[7] - mov 32+8*2(%rsp), %rax # @acc[8] - mov 32+8*3(%rsp), %r12 # @acc[4] - mov 32+8*4(%rsp), %rdi # $lo - mov 32+8*5(%rsp), %rbp # $hi - #lea -128($a_ptr), $a_ptr # control u-op density - #lea -128($n_ptr), $n_ptr # control u-op density - - mulx %r14, %r8, %r9 - call __mulx_mont_383_nonred -___ -{ -my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 - 8..11,13,14); -$code.=<<___; - mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im - lea 128($n_ptr), $n_ptr - mov 32+8*0(%rsp), @acc[6] - and @acc[11], @acc[6] - mov 32+8*1(%rsp), @acc[7] - and @acc[11], @acc[7] - mov 
32+8*2(%rsp), @acc[8] - and @acc[11], @acc[8] - mov 32+8*3(%rsp), @acc[9] - and @acc[11], @acc[9] - mov 32+8*4(%rsp), @acc[10] - and @acc[11], @acc[10] - and 32+8*5(%rsp), @acc[11] - - sub @acc[6], @acc[0] - mov 8*0($n_ptr), @acc[6] - sbb @acc[7], @acc[1] - mov 8*1($n_ptr), @acc[7] - sbb @acc[8], @acc[2] - mov 8*2($n_ptr), @acc[8] - sbb @acc[9], @acc[3] - mov 8*3($n_ptr), @acc[9] - sbb @acc[10], @acc[4] - mov 8*4($n_ptr), @acc[10] - sbb @acc[11], @acc[5] - sbb @acc[11], @acc[11] - - and @acc[11], @acc[6] - and @acc[11], @acc[7] - and @acc[11], @acc[8] - and @acc[11], @acc[9] - and @acc[11], @acc[10] - and 8*5($n_ptr), @acc[11] - - add @acc[6], @acc[0] - adc @acc[7], @acc[1] - adc @acc[8], @acc[2] - adc @acc[9], @acc[3] - adc @acc[10], @acc[4] - adc @acc[11], @acc[5] - - mov @acc[0], 8*0($b_ptr) # ret->re - mov @acc[1], 8*1($b_ptr) - mov @acc[2], 8*2($b_ptr) - mov @acc[3], 8*3($b_ptr) - mov @acc[4], 8*4($b_ptr) - mov @acc[5], 8*5($b_ptr) -___ -} -$code.=<<___; - lea $frame(%rsp), %r8 # size optimization - mov 8*0(%r8),%r15 -.cfi_restore %r15 - mov 8*1(%r8),%r14 -.cfi_restore %r14 - mov 8*2(%r8),%r13 -.cfi_restore %r13 - mov 8*3(%r8),%r12 -.cfi_restore %r12 - mov 8*4(%r8),%rbx -.cfi_restore %rbx - mov 8*5(%r8),%rbp -.cfi_restore %rbp - lea 8*6(%r8),%rsp -.cfi_adjust_cfa_offset -$frame-8*6 -.cfi_epilogue - ret -.cfi_endproc -.size sqrx_mont_382x,.-sqrx_mont_382x -___ -} - -print $code; -close STDOUT; diff --git a/crypto/blst_src/asm/sha256-armv8.pl b/crypto/blst_src/asm/sha256-armv8.pl deleted file mode 100755 index 1de27c70667..00000000000 --- a/crypto/blst_src/asm/sha256-armv8.pl +++ /dev/null @@ -1,541 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# sha256_block procedure for ARMv8. -# -# This module is stripped of scalar code paths, with raionale that all -# known processors are NEON-capable. -# -# See original module at CRYPTOGAMS for further details. 
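For orientation while reading the scheduled code below: the rotation-count arrays (@Sigma0, @Sigma1, @sigma0, @sigma1) defined a few lines down are the standard FIPS 180-4 SHA-256 constants, and both the crypto-extension path and the NEON path compute the usual round function and message schedule, only heavily interleaved. The following portable C fragment is a reference-only sketch of those primitives (names such as ror32 and round_00_63 are illustrative and not part of the blst sources):

/* Reference sketch of the SHA-256 primitives the assembly below implements. */
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* "Big" Sigma functions used in every round: rotations (2,13,22) and (6,11,25). */
static inline uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }
/* "Small" sigma functions used by the message schedule: (7,18,3) and (17,19,10). */
static inline uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static inline uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* One compression round; Ki is the round constant, Wi the scheduled word,
 * where W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] for i >= 16. */
static void round_00_63(uint32_t S[8], uint32_t Ki, uint32_t Wi)
{
    uint32_t a = S[0], b = S[1], c = S[2], d = S[3];
    uint32_t e = S[4], f = S[5], g = S[6], h = S[7];
    uint32_t T1 = h + Sigma1(e) + ((e & f) ^ (~e & g)) + Ki + Wi;  /* Ch(e,f,g) */
    uint32_t T2 = Sigma0(a) + ((a & b) ^ (a & c) ^ (b & c));       /* Maj(a,b,c) */
    S[7] = g; S[6] = f; S[5] = e; S[4] = d + T1;
    S[3] = c; S[2] = b; S[1] = a; S[0] = T1 + T2;
}

The NEON Xupdate/body_00_15 machinery below interleaves four such rounds with one vectorized schedule step, which is why the generated instruction stream looks nothing like this straight-line form.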
- -$flavour = shift; -$output = shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -$BITS=256; -$SZ=4; -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); -$rounds=64; -$reg_t="w"; -$pre="blst_"; - -($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); - -$code.=<<___; -.text - -.align 6 -.type .LK$BITS,%object -.LK$BITS: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0 //terminator -.size .LK$BITS,.-.LK$BITS -.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" -.align 2 -___ - -if ($SZ==4) { -my $Ktbl="x3"; - -my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); -my @MSG=map("v$_.16b",(4..7)); -my ($W0,$W1)=("v16.4s","v17.4s"); -my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); - -$code.=<<___; -.globl ${pre}sha256_block_armv8 -.type ${pre}sha256_block_armv8,%function -.align 6 -${pre}sha256_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 - -.Loop_hw: - ld1 {@MSG[0]-@MSG[3]},[$inp],#64 - sub $num,$num,#1 - ld1.32 {$W0},[$Ktbl],#16 - rev32 @MSG[0],@MSG[0] - rev32 @MSG[1],@MSG[1] - rev32 @MSG[2],@MSG[2] - rev32 @MSG[3],@MSG[3] - orr $ABCD_SAVE,$ABCD,$ABCD // offload - orr $EFGH_SAVE,$EFGH,$EFGH -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - ld1.32 {$W0},[$Ktbl],#16 - add.i32 $W1,$W1,@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - ld1.32 {$W1},[$Ktbl] - add.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - add.i32 $W1,$W1,@MSG[3] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - add.i32 $ABCD,$ABCD,$ABCD_SAVE - add.i32 $EFGH,$EFGH,$EFGH_SAVE - - cbnz $num,.Loop_hw - - st1.32 {$ABCD,$EFGH},[$ctx] - - ldr x29,[sp],#16 - ret -.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 -___ -} - -if ($SZ==4) { ######################################### NEON stuff # -# You'll surely note a lot of similarities with sha256-armv4 module, -# and of course it's not a coincidence. 
sha256-armv4 was used as -# initial template, but was adapted for ARMv8 instruction set and -# extensively re-tuned for all-round performance. - -my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); -my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); -my $Ktbl="x16"; -my $Xfer="x17"; -my @X = map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); -my $j=0; - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } -sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } -sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - &ushr_32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] - eval(shift(@insns)); - &sli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T4,$T7,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T4,$T7,32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T5,$T7,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T7,$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_u32 ($T3,$T7,32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T6,@X[0],$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T7,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T6,@X[0],32-$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T5,@X[0],$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T6); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T5,@X[0],32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl], #16"); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T5); - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dhi($T5), &Dlo($T7)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - while($#insns>=1) 
{ eval(shift(@insns)); } - &st1_32 ("{$T0}","[$Xfer], #16"); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_8 ("{@X[0]}","[$inp],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &rev32 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &st1_32 ("{$T0}","[$Xfer], #16"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past - '&and ($t1,$f,$e)', - '&bic ($t4,$g,$e)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&orr ($t1,$t1,$t4)', # Ch(e,f,g) - '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ror ($t0,$t0,"#$Sigma1[0]")', - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t0)', # h+=Sigma1(e) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&ror ($t4,$t4,"#$Sigma0[0]")', - '&add ($d,$d,$h)', # d+=h - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -.globl ${pre}sha256_block_data_order -.type ${pre}sha256_block_data_order,%function -.align 4 -${pre}sha256_block_data_order: - stp x29, x30, [sp, #-16]! 
- mov x29, sp - sub sp,sp,#16*4 - - adr $Ktbl,.LK256 - add $num,$inp,$num,lsl#6 // len to point at the end of inp - - ld1.8 {@X[0]},[$inp], #16 - ld1.8 {@X[1]},[$inp], #16 - ld1.8 {@X[2]},[$inp], #16 - ld1.8 {@X[3]},[$inp], #16 - ld1.32 {$T0},[$Ktbl], #16 - ld1.32 {$T1},[$Ktbl], #16 - ld1.32 {$T2},[$Ktbl], #16 - ld1.32 {$T3},[$Ktbl], #16 - rev32 @X[0],@X[0] // yes, even on - rev32 @X[1],@X[1] // big-endian - rev32 @X[2],@X[2] - rev32 @X[3],@X[3] - mov $Xfer,sp - add.32 $T0,$T0,@X[0] - add.32 $T1,$T1,@X[1] - add.32 $T2,$T2,@X[2] - st1.32 {$T0-$T1},[$Xfer], #32 - add.32 $T3,$T3,@X[3] - st1.32 {$T2-$T3},[$Xfer] - sub $Xfer,$Xfer,#32 - - ldp $A,$B,[$ctx] - ldp $C,$D,[$ctx,#8] - ldp $E,$F,[$ctx,#16] - ldp $G,$H,[$ctx,#24] - ldr $t1,[sp,#0] - mov $t2,wzr - eor $t3,$B,$C - mov $t4,wzr - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - cmp $t1,#0 // check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - sub $Ktbl,$Ktbl,#256 // rewind $Ktbl - cmp $inp,$num - mov $Xfer, #64 - csel $Xfer, $Xfer, xzr, eq - sub $inp,$inp,$Xfer // avoid SEGV - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - add $A,$A,$t4 // h+=Sigma0(a) from the past - ldp $t0,$t1,[$ctx,#0] - add $A,$A,$t2 // h+=Maj(a,b,c) from the past - ldp $t2,$t3,[$ctx,#8] - add $A,$A,$t0 // accumulate - add $B,$B,$t1 - ldp $t0,$t1,[$ctx,#16] - add $C,$C,$t2 - add $D,$D,$t3 - ldp $t2,$t3,[$ctx,#24] - add $E,$E,$t0 - add $F,$F,$t1 - ldr $t1,[sp,#0] - stp $A,$B,[$ctx,#0] - add $G,$G,$t2 - mov $t2,wzr - stp $C,$D,[$ctx,#8] - add $H,$H,$t3 - stp $E,$F,[$ctx,#16] - eor $t3,$B,$C - stp $G,$H,[$ctx,#24] - mov $t4,wzr - mov $Xfer,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret -.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order -___ -} - -{ -my ($out,$inp,$len) = map("x$_",(0..2)); - -$code.=<<___; -.globl ${pre}sha256_emit -.hidden ${pre}sha256_emit -.type ${pre}sha256_emit,%function -.align 4 -${pre}sha256_emit: - ldp x4,x5,[$inp] - ldp x6,x7,[$inp,#16] -#ifndef __AARCH64EB__ - rev x4,x4 - rev x5,x5 - rev x6,x6 - rev x7,x7 -#endif - str w4,[$out,#4] - lsr x4,x4,#32 - str w5,[$out,#12] - lsr x5,x5,#32 - str w6,[$out,#20] - lsr x6,x6,#32 - str w7,[$out,#28] - lsr x7,x7,#32 - str w4,[$out,#0] - str w5,[$out,#8] - str w6,[$out,#16] - str w7,[$out,#24] - ret -.size ${pre}sha256_emit,.-${pre}sha256_emit - -.globl ${pre}sha256_bcopy -.hidden ${pre}sha256_bcopy -.type ${pre}sha256_bcopy,%function -.align 4 -${pre}sha256_bcopy: -.Loop_bcopy: - ldrb w3,[$inp],#1 - sub $len,$len,#1 - strb w3,[$out],#1 - cbnz $len,.Loop_bcopy - ret -.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy - -.globl ${pre}sha256_hcopy -.hidden ${pre}sha256_hcopy -.type ${pre}sha256_hcopy,%function -.align 4 -${pre}sha256_hcopy: - ldp x4,x5,[$inp] - ldp x6,x7,[$inp,#16] - stp x4,x5,[$out] - stp x6,x7,[$out,#16] - ret -.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy -___ -} - -{ my %opcode = ( - "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, - "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o - && - sprintf ".inst\t0x%08x\t//%s %s", - $opcode{$mnemonic}|$1|($2<<5)|($3<<16), - $mnemonic,$arg; - } -} - -open SELF,$0; -while() { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - 
-foreach(split("\n",$code)) { - - s/\`([^\`]*)\`/eval($1)/ge; - - s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or - s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; - - s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers - - s/\.[ui]?8(\s)/$1/; - s/\.\w?64\b// and s/\.16b/\.2d/g or - s/\.\w?32\b// and s/\.16b/\.4s/g; - m/\bext\b/ and s/\.2d/\.16b/g or - m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; - - print $_,"\n"; -} - -close STDOUT; diff --git a/crypto/blst_src/asm/sha256-portable-x86_64.pl b/crypto/blst_src/asm/sha256-portable-x86_64.pl deleted file mode 100755 index eca0564ebe7..00000000000 --- a/crypto/blst_src/asm/sha256-portable-x86_64.pl +++ /dev/null @@ -1,337 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# sha256_block procedure for x86_64. -# -# Scalar-only version with minor twist minimizing 'lea' instructions. - -$flavour = shift; -$output = pop; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -$pre="blst_"; -$func="${pre}sha256_block_data_order"; -$TABLE="K256"; -$SZ=4; -@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", - "%r8d","%r9d","%r10d","%r11d"); -($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); -$rounds=64; - -$ctx="%rdi"; # 1st arg, zapped by $a3 -$inp="%rsi"; # 2nd arg -$Tbl="%rbp"; - -$_ctx="16*$SZ+0*8(%rsp)"; -$_inp="16*$SZ+1*8(%rsp)"; -$_end="16*$SZ+2*8(%rsp)"; -$framesz="16*$SZ+3*8"; - -sub ROUND_00_15() -{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - my $STRIDE=$SZ; - # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); - -$code.=<<___; - ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 - mov $f,$a2 - - xor $e,$a0 - ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 - xor $g,$a2 # f^g - - mov $T1,`$SZ*($i&0xf)`(%rsp) - xor $a,$a1 - and $e,$a2 # (f^g)&e - - ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 - add $h,$T1 # T1+=h - xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g - - ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 - xor $e,$a0 - add $a2,$T1 # T1+=Ch(e,f,g) - - mov $a,$a2 - add `$SZ*$i`($Tbl),$T1 # T1+=K[round] - xor $a,$a1 - - xor $b,$a2 # a^b, b^c in next round - ror \$$Sigma1[0],$a0 # Sigma1(e) - mov $b,$h - - and $a2,$a3 - ror \$$Sigma0[0],$a1 # Sigma0(a) - add $a0,$T1 # T1+=Sigma1(e) - - xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) - add $T1,$d # d+=T1 - add $T1,$h # h+=T1 -___ -$code.=<<___ if ($i==31); - lea `16*$SZ`($Tbl),$Tbl # round+=16 -___ -$code.=<<___ if ($i<15); - add $a1,$h # h+=Sigma0(a) -___ - ($a2,$a3) = ($a3,$a2); -} - -sub ROUND_16_XX() -{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; - -$code.=<<___; - mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 - mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 - - mov $a0,$T1 - ror \$`$sigma0[1]-$sigma0[0]`,$a0 - add $a1,$a # modulo-scheduled h+=Sigma0(a) - mov $a2,$a1 - ror \$`$sigma1[1]-$sigma1[0]`,$a2 - - xor $T1,$a0 - shr \$$sigma0[2],$T1 - ror \$$sigma0[0],$a0 - xor $a1,$a2 - shr 
\$$sigma1[2],$a1 - - ror \$$sigma1[0],$a2 - xor $a0,$T1 # sigma0(X[(i+1)&0xf]) - xor $a1,$a2 # sigma1(X[(i+14)&0xf]) - add `$SZ*(($i+9)&0xf)`(%rsp),$T1 - - add `$SZ*($i&0xf)`(%rsp),$T1 - mov $e,$a0 - add $a2,$T1 - mov $a,$a1 -___ - &ROUND_00_15(@_); -} - -$code=<<___; -.text - -.globl $func -.type $func,\@function,3,"unwind" -.align 16 -$func: -.cfi_startproc - push %rbx -.cfi_push %rbx - push %rbp -.cfi_push %rbp - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - shl \$4,%rdx # num*16 - sub \$$framesz,%rsp -.cfi_adjust_cfa_offset $framesz - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arh - mov %rdx,$_end # save end pointer, "3rd" arg -.cfi_end_prologue - - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H - jmp .Lloop - -.align 16 -.Lloop: - mov $B,$a3 - lea $TABLE(%rip),$Tbl - xor $C,$a3 # magic -___ - for($i=0;$i<16;$i++) { - $code.=" mov $SZ*$i($inp),$T1\n"; - $code.=" mov @ROT[4],$a0\n"; - $code.=" mov @ROT[0],$a1\n"; - $code.=" bswap $T1\n"; - &ROUND_00_15($i,@ROT); - unshift(@ROT,pop(@ROT)); - } -$code.=<<___; - jmp .Lrounds_16_xx -.align 16 -.Lrounds_16_xx: -___ - for(;$i<32;$i++) { - &ROUND_16_XX($i,@ROT); - unshift(@ROT,pop(@ROT)); - } - -$code.=<<___; - cmpb \$0x19,`$SZ-1`($Tbl) - jnz .Lrounds_16_xx - - mov $_ctx,$ctx - add $a1,$A # modulo-scheduled h+=Sigma0(a) - lea 16*$SZ($inp),$inp - - add $SZ*0($ctx),$A - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop - - lea $framesz+6*8(%rsp),%r11 -.cfi_def_cfa %r11,8 - mov $framesz(%rsp),%r15 -.cfi_restore %r15 - mov -40(%r11),%r14 -.cfi_restore %r14 - mov -32(%r11),%r13 -.cfi_restore %r13 - mov -24(%r11),%r12 -.cfi_restore %r12 - mov -16(%r11),%rbp -.cfi_restore %rbp - mov -8(%r11),%rbx -.cfi_restore %rbx -.cfi_epilogue - lea (%r11),%rsp - ret -.cfi_endproc -.size $func,.-$func - -.align 64 -.type $TABLE,\@object -$TABLE: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" -___ -{ -my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order - ("%rdi","%rsi","%rdx"); # Unix order -$code.=<<___; -.globl ${pre}sha256_emit -.hidden ${pre}sha256_emit -.type ${pre}sha256_emit,\@abi-omnipotent -.align 16 -${pre}sha256_emit: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - bswap %r8 - mov 24($inp), %r11 - bswap %r9 - mov %r8d, 4($out) - bswap %r10 - mov %r9d, 12($out) - bswap %r11 - mov %r10d, 20($out) - shr \$32, %r8 - mov %r11d, 28($out) - shr \$32, %r9 - mov %r8d, 0($out) - shr \$32, %r10 - mov %r9d, 8($out) - shr \$32, %r11 - mov %r10d, 16($out) - mov %r11d, 24($out) - ret -.size ${pre}sha256_emit,.-${pre}sha256_emit - -.globl ${pre}sha256_bcopy -.hidden ${pre}sha256_bcopy -.type ${pre}sha256_bcopy,\@abi-omnipotent -.align 16 -${pre}sha256_bcopy: - sub $inp, $out -.Loop_bcopy: - movzb ($inp), %eax - lea 1($inp), $inp - mov %al, -1($out,$inp) - dec $len - jnz .Loop_bcopy - ret -.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy - -.globl ${pre}sha256_hcopy -.hidden ${pre}sha256_hcopy -.type ${pre}sha256_hcopy,\@abi-omnipotent -.align 16 -${pre}sha256_hcopy: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - mov 24($inp), %r11 - mov %r8, 0($out) - mov %r9, 8($out) - mov %r10, 16($out) - mov %r11, 24($out) - ret -.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy -___ -} - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - print $_,"\n"; -} -close STDOUT; diff --git a/crypto/blst_src/asm/sha256-x86_64.pl b/crypto/blst_src/asm/sha256-x86_64.pl deleted file mode 100755 index 22b376318fa..00000000000 --- a/crypto/blst_src/asm/sha256-x86_64.pl +++ /dev/null @@ -1,789 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# sha256_block procedure for x86_64. -# -# This module is stripped of AVX and even scalar code paths, with -# raionale that -# -# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* -# processor, venerable Sandy Bridge; -# b) AVX2 incurs costly power transitions, which would be justifiable -# if AVX2 code was executing most of the time, which is not the -# case in the context; -# c) all comtemporary processors support SSSE3, so that nobody would -# actually use scalar code path anyway; -# -# See original module at CRYPTOGAMS for further details. 
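The generator below emits blst_sha256_block_data_order (an SSSE3 path plus a SHA-extension variant) together with the small blst_sha256_emit, blst_sha256_bcopy and blst_sha256_hcopy helpers. As a hedged sketch of how those entry points fit together, here is a minimal one-shot driver in C; the prototypes are inferred from the register usage in the assembly (state pointer, input pointer, number of 64-byte blocks) rather than copied from a blst header, and sha256_oneshot is an illustrative name, not a blst function:

/* Minimal sketch, assuming the prototypes below match the assembly's calling
 * convention; not part of the blst sources. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

void blst_sha256_block_data_order(uint32_t *h, const void *inp, size_t blocks);
void blst_sha256_emit(unsigned char md[32], const uint32_t h[8]);

static void sha256_oneshot(unsigned char md[32], const unsigned char *msg, size_t len)
{
    uint32_t h[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                      0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
    unsigned char tail[128] = {0};
    size_t full = len & ~(size_t)63;            /* bytes in whole 64-byte blocks */
    size_t rem  = len - full;
    size_t padded = (rem + 1 + 8 <= 64) ? 64 : 128;
    uint64_t bits = (uint64_t)len * 8;

    if (full)
        blst_sha256_block_data_order(h, msg, full / 64);

    /* Standard SHA-256 padding: 0x80, zeros, 64-bit big-endian bit count. */
    memcpy(tail, msg + full, rem);
    tail[rem] = 0x80;
    for (size_t i = 0; i < 8; i++)
        tail[padded - 1 - i] = (unsigned char)(bits >> (8 * i));

    blst_sha256_block_data_order(h, tail, padded / 64);
    blst_sha256_emit(md, h);                    /* native state -> big-endian digest */
}

blst_sha256_hcopy and blst_sha256_bcopy, also emitted below, are plain 32-byte and byte-wise copies used by the C-level callers and are not needed in this sketch.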
- -$flavour = shift; -$output = pop; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or -( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or -die "can't locate x86_64-xlate.pl"; - -open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" - or die "can't call $xlate: $!"; - -$pre="blst_"; -$func="${pre}sha256_block_data_order"; -$TABLE="K256"; -$SZ=4; -@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", - "%r8d","%r9d","%r10d","%r11d"); -($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); -@Sigma0=( 2,13,22); -@Sigma1=( 6,11,25); -@sigma0=( 7,18, 3); -@sigma1=(17,19,10); -$rounds=64; - -$ctx="%rdi"; # 1st arg, zapped by $a3 -$inp="%rsi"; # 2nd arg -$Tbl="%rbp"; - -$_ctx="16*$SZ+0*8(%rsp)"; -$_inp="16*$SZ+1*8(%rsp)"; -$_end="16*$SZ+2*8(%rsp)"; -$framesz="16*$SZ+3*8"; - -$code=<<___; -.text - -.align 64 -.type $TABLE,\@object -$TABLE: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f - .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff - .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 - .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" -___ - -###################################################################### -# SIMD code paths -# -{{{ -###################################################################### -# Intel SHA Extensions implementation of SHA256 update function. 
-# -my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); - -my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); -my @MSG=map("%xmm$_",(3..6)); - -$code.=<<___; -.globl ${pre}sha256_block_data_order_shaext -.hidden ${pre}sha256_block_data_order_shaext -.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" -.align 64 -${pre}sha256_block_data_order_shaext: -.cfi_startproc -___ -$code.=<<___ if ($win64); - sub \$0x58,%rsp -.cfi_adjust_cfa_offset 0x58 - movaps %xmm6,-0x58(%r11) -.cfi_offset %xmm6,-0x60 - movaps %xmm7,-0x48(%r11) -.cfi_offset %xmm7,-0x50 - movaps %xmm8,-0x38(%r11) -.cfi_offset %xmm8,-0x40 - movaps %xmm9,-0x28(%r11) -.cfi_offset %xmm9,-0x30 - movaps %xmm10,-0x18(%r11) -.cfi_offset %xmm10,-0x20 -.cfi_end_prologue -___ -$code.=<<___; - lea K256+0x80(%rip),$Tbl - movdqu ($ctx),$ABEF # DCBA - movdqu 16($ctx),$CDGH # HGFE - movdqa 0x100-0x80($Tbl),$TMP # byte swap mask - - pshufd \$0x1b,$ABEF,$Wi # ABCD - pshufd \$0xb1,$ABEF,$ABEF # CDAB - pshufd \$0x1b,$CDGH,$CDGH # EFGH - movdqa $TMP,$BSWAP # offload - palignr \$8,$CDGH,$ABEF # ABEF - punpcklqdq $Wi,$CDGH # CDGH - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu ($inp),@MSG[0] - movdqu 0x10($inp),@MSG[1] - movdqu 0x20($inp),@MSG[2] - pshufb $TMP,@MSG[0] - movdqu 0x30($inp),@MSG[3] - - movdqa 0*16-0x80($Tbl),$Wi - paddd @MSG[0],$Wi - pshufb $TMP,@MSG[1] - movdqa $CDGH,$CDGH_SAVE # offload - sha256rnds2 $ABEF,$CDGH # 0-3 - pshufd \$0x0e,$Wi,$Wi - nop - movdqa $ABEF,$ABEF_SAVE # offload - sha256rnds2 $CDGH,$ABEF - - movdqa 1*16-0x80($Tbl),$Wi - paddd @MSG[1],$Wi - pshufb $TMP,@MSG[2] - sha256rnds2 $ABEF,$CDGH # 4-7 - pshufd \$0x0e,$Wi,$Wi - lea 0x40($inp),$inp - sha256msg1 @MSG[1],@MSG[0] - sha256rnds2 $CDGH,$ABEF - - movdqa 2*16-0x80($Tbl),$Wi - paddd @MSG[2],$Wi - pshufb $TMP,@MSG[3] - sha256rnds2 $ABEF,$CDGH # 8-11 - pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[3],$TMP - palignr \$4,@MSG[2],$TMP - nop - paddd $TMP,@MSG[0] - sha256msg1 @MSG[2],@MSG[1] - sha256rnds2 $CDGH,$ABEF - - movdqa 3*16-0x80($Tbl),$Wi - paddd @MSG[3],$Wi - sha256msg2 @MSG[3],@MSG[0] - sha256rnds2 $ABEF,$CDGH # 12-15 - pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[0],$TMP - palignr \$4,@MSG[3],$TMP - nop - paddd $TMP,@MSG[1] - sha256msg1 @MSG[3],@MSG[2] - sha256rnds2 $CDGH,$ABEF -___ -for($i=4;$i<16-3;$i++) { -$code.=<<___; - movdqa $i*16-0x80($Tbl),$Wi - paddd @MSG[0],$Wi - sha256msg2 @MSG[0],@MSG[1] - sha256rnds2 $ABEF,$CDGH # 16-19... 
- pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[1],$TMP - palignr \$4,@MSG[0],$TMP - nop - paddd $TMP,@MSG[2] - sha256msg1 @MSG[0],@MSG[3] - sha256rnds2 $CDGH,$ABEF -___ - push(@MSG,shift(@MSG)); -} -$code.=<<___; - movdqa 13*16-0x80($Tbl),$Wi - paddd @MSG[0],$Wi - sha256msg2 @MSG[0],@MSG[1] - sha256rnds2 $ABEF,$CDGH # 52-55 - pshufd \$0x0e,$Wi,$Wi - movdqa @MSG[1],$TMP - palignr \$4,@MSG[0],$TMP - sha256rnds2 $CDGH,$ABEF - paddd $TMP,@MSG[2] - - movdqa 14*16-0x80($Tbl),$Wi - paddd @MSG[1],$Wi - sha256rnds2 $ABEF,$CDGH # 56-59 - pshufd \$0x0e,$Wi,$Wi - sha256msg2 @MSG[1],@MSG[2] - movdqa $BSWAP,$TMP - sha256rnds2 $CDGH,$ABEF - - movdqa 15*16-0x80($Tbl),$Wi - paddd @MSG[2],$Wi - nop - sha256rnds2 $ABEF,$CDGH # 60-63 - pshufd \$0x0e,$Wi,$Wi - dec $num - nop - sha256rnds2 $CDGH,$ABEF - - paddd $CDGH_SAVE,$CDGH - paddd $ABEF_SAVE,$ABEF - jnz .Loop_shaext - - pshufd \$0xb1,$CDGH,$CDGH # DCHG - pshufd \$0x1b,$ABEF,$TMP # FEBA - pshufd \$0xb1,$ABEF,$ABEF # BAFE - punpckhqdq $CDGH,$ABEF # DCBA - palignr \$8,$TMP,$CDGH # HGFE - - movdqu $ABEF,($ctx) - movdqu $CDGH,16($ctx) -___ -$code.=<<___ if ($win64); - movaps -0x58(%r11),%xmm6 - movaps -0x48(%r11),%xmm7 - movaps -0x38(%r11),%xmm8 - movaps -0x28(%r11),%xmm9 - movaps -0x18(%r11),%xmm10 - mov %r11,%rsp -.cfi_def_cfa %r11,8 -.cfi_epilogue -___ -$code.=<<___; - ret -.cfi_endproc -.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext -___ -}}} -{{{ - -my $a4=$T1; -my ($a,$b,$c,$d,$e,$f,$g,$h); - -sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; - my $arg = pop; - $arg = "\$$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. - - '&ror ($a0,$Sigma1[2]-$Sigma1[1])', - '&mov ($a,$a1)', - '&mov ($a4,$f)', - - '&ror ($a1,$Sigma0[2]-$Sigma0[1])', - '&xor ($a0,$e)', - '&xor ($a4,$g)', # f^g - - '&ror ($a0,$Sigma1[1]-$Sigma1[0])', - '&xor ($a1,$a)', - '&and ($a4,$e)', # (f^g)&e - - '&xor ($a0,$e)', - '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] - '&mov ($a2,$a)', - - '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g - '&ror ($a1,$Sigma0[1]-$Sigma0[0])', - '&xor ($a2,$b)', # a^b, b^c in next round - - '&add ($h,$a4)', # h+=Ch(e,f,g) - '&ror ($a0,$Sigma1[0])', # Sigma1(e) - '&and ($a3,$a2)', # (b^c)&(a^b) - - '&xor ($a1,$a)', - '&add ($h,$a0)', # h+=Sigma1(e) - '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) - - '&ror ($a1,$Sigma0[0])', # Sigma0(a) - '&add ($d,$h)', # d+=h - '&add ($h,$a3)', # h+=Maj(a,b,c) - - '&mov ($a0,$d)', - '&add ($a1,$h);'. 
# h+=Sigma0(a) - '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' - ); -} - -###################################################################### -# SSSE3 code path -# -{ -my $Tbl = $inp; -my $_ctx="0(%rbp)"; -my $_inp="8(%rbp)"; -my $_end="16(%rbp)"; -my $framesz=4*8+$win64*16*4+8; - -my @X = map("%xmm$_",(0..3)); -my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); - -$code.=<<___; -.globl ${func} -.hidden ${func} -.type ${func},\@function,3,"unwind" -.align 64 -${func}: -.cfi_startproc - push %rbp -.cfi_push %rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - shl \$4,%rdx # num*16 - sub \$$framesz,%rsp -.cfi_adjust_cfa_offset $framesz - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - mov $ctx,0(%rsp) # save ctx, 1st arg - #mov $inp,8(%rsp) # save inp, 2nd arg - mov %rdx,16(%rsp) # save end pointer, "3rd" arg -___ -$code.=<<___ if ($win64); - movaps %xmm6,0x20(%rsp) -.cfi_offset %xmm6,-0x78 - movaps %xmm7,0x30(%rsp) -.cfi_offset %xmm7,-0x68 - movaps %xmm8,0x40(%rsp) -.cfi_offset %xmm8,-0x58 - movaps %xmm9,0x50(%rsp) -.cfi_offset %xmm9,-0x48 -___ -$code.=<<___; - mov %rsp,%rbp -.cfi_def_cfa_register %rbp -.cfi_end_prologue - - lea -16*$SZ(%rsp),%rsp - mov $SZ*0($ctx),$A - and \$-64,%rsp # align stack - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H -___ - -$code.=<<___; - #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 - #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 - jmp .Lloop_ssse3 -.align 16 -.Lloop_ssse3: - movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 - mov $inp,$_inp # offload $inp - movdqu 0x00($inp),@X[0] - movdqu 0x10($inp),@X[1] - movdqu 0x20($inp),@X[2] - pshufb $t3,@X[0] - movdqu 0x30($inp),@X[3] - lea $TABLE(%rip),$Tbl - pshufb $t3,@X[1] - movdqa 0x00($Tbl),$t0 - movdqa 0x10($Tbl),$t1 - pshufb $t3,@X[2] - paddd @X[0],$t0 - movdqa 0x20($Tbl),$t2 - pshufb $t3,@X[3] - movdqa 0x30($Tbl),$t3 - paddd @X[1],$t1 - paddd @X[2],$t2 - paddd @X[3],$t3 - movdqa $t0,0x00(%rsp) - mov $A,$a1 - movdqa $t1,0x10(%rsp) - mov $B,$a3 - movdqa $t2,0x20(%rsp) - xor $C,$a3 # magic - movdqa $t3,0x30(%rsp) - mov $E,$a0 - jmp .Lssse3_00_47 - -.align 16 -.Lssse3_00_47: - sub \$`-16*$SZ`,$Tbl # size optimization -___ -sub Xupdate_256_SSSE3 () { - ( - '&movdqa ($t0,@X[1]);', - '&movdqa ($t3,@X[3])', - '&palignr ($t0,@X[0],$SZ)', # X[1..4] - '&palignr ($t3,@X[2],$SZ);', # X[9..12] - '&movdqa ($t1,$t0)', - '&movdqa ($t2,$t0);', - '&psrld ($t0,$sigma0[2])', - '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] - '&psrld ($t2,$sigma0[0])', - '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] - '&pslld ($t1,8*$SZ-$sigma0[1]);'. - '&pxor ($t0,$t2)', - '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. - '&pxor ($t0,$t1)', - '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
- '&pxor ($t0,$t2);', - '&movdqa ($t2,$t3)', - '&pxor ($t0,$t1);', # sigma0(X[1..4]) - '&psrld ($t3,$sigma1[2])', - '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) - '&psrlq ($t2,$sigma1[0])', - '&pxor ($t3,$t2);', - '&psrlq ($t2,$sigma1[1]-$sigma1[0])', - '&pxor ($t3,$t2)', - '&pshufb ($t3,$t4)', # sigma1(X[14..15]) - '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) - '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] - '&movdqa ($t2,$t3);', - '&psrld ($t3,$sigma1[2])', - '&psrlq ($t2,$sigma1[0])', - '&pxor ($t3,$t2);', - '&psrlq ($t2,$sigma1[1]-$sigma1[0])', - '&pxor ($t3,$t2);', - '&movdqa ($t2,16*$j."($Tbl)")', - '&pshufb ($t3,$t5)', - '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) - ); -} - -sub SSSE3_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 104 instructions - - if (0) { - foreach (Xupdate_256_SSSE3()) { # 36 instructions - eval; - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - } - } else { # squeeze extra 4% on Westmere and 19% on Atom - eval(shift(@insns)); #@ - &movdqa ($t0,@X[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t3,@X[3]); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &palignr ($t0,@X[0],$SZ); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - &palignr ($t3,@X[2],$SZ); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &movdqa ($t1,$t0); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,$t0); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrld ($t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrld ($t2,$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,@X[3],0b11111010); # X[4..15] - eval(shift(@insns)); - eval(shift(@insns)); #@ - &pslld ($t1,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrld ($t2,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - &pxor ($t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - &pslld ($t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t2); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &movdqa ($t2,$t3); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t0,$t1); # sigma0(X[1..4]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t3,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - #&pshufb ($t3,$t4); # sigma1(X[14..15]) - &pshufd ($t3,$t3,0b10000000); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &psrldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - 
eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pshufd ($t3,@X[0],0b01010000); # X[16..17] - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &movdqa ($t2,$t3); - eval(shift(@insns)); - eval(shift(@insns)); - &psrld ($t3,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); #@ - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &pxor ($t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); #@ - #&pshufb ($t3,$t5); - &pshufd ($t3,$t3,0b00001000); - eval(shift(@insns)); - eval(shift(@insns)); - &movdqa ($t2,16*$j."($Tbl)"); - eval(shift(@insns)); #@ - eval(shift(@insns)); - &pslldq ($t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); #@ - eval(shift(@insns)); - eval(shift(@insns)); - } - &paddd ($t2,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &movdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &SSSE3_256_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); - &jne (".Lssse3_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } -$code.=<<___; - mov $_ctx,$ctx - mov $a1,$A - mov $_inp,$inp - - add $SZ*0($ctx),$A - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - lea 16*$SZ($inp),$inp - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop_ssse3 - - xorps %xmm0, %xmm0 - lea $framesz+6*8(%rbp),%r11 -.cfi_def_cfa %r11,8 - movaps %xmm0, 0x00(%rsp) # scrub the stack - movaps %xmm0, 0x10(%rsp) - movaps %xmm0, 0x20(%rsp) - movaps %xmm0, 0x30(%rsp) -___ -$code.=<<___ if ($win64); - movaps 0x20(%rbp),%xmm6 - movaps 0x30(%rbp),%xmm7 - movaps 0x40(%rbp),%xmm8 - movaps 0x50(%rbp),%xmm9 -___ -$code.=<<___; - mov $framesz(%rbp),%r15 -.cfi_restore %r15 - mov -40(%r11),%r14 -.cfi_restore %r14 - mov -32(%r11),%r13 -.cfi_restore %r13 - mov -24(%r11),%r12 -.cfi_restore %r12 - mov -16(%r11),%rbx -.cfi_restore %rbx - mov -8(%r11),%rbp -.cfi_restore %rbp -.cfi_epilogue - lea (%r11),%rsp - ret -.cfi_endproc -.size ${func},.-${func} -___ -} -}}} -{ -my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order - ("%rdi","%rsi","%rdx"); # Unix order -$code.=<<___; -.globl ${pre}sha256_emit -.hidden ${pre}sha256_emit -.type ${pre}sha256_emit,\@abi-omnipotent -.align 16 -${pre}sha256_emit: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - bswap %r8 - mov 24($inp), %r11 - bswap %r9 - mov %r8d, 4($out) - bswap %r10 - mov %r9d, 12($out) - bswap %r11 - mov %r10d, 20($out) - shr \$32, %r8 - mov %r11d, 28($out) - shr \$32, %r9 - mov %r8d, 0($out) - shr \$32, %r10 - mov %r9d, 8($out) - shr \$32, %r11 - mov %r10d, 16($out) - mov %r11d, 24($out) - ret -.size ${pre}sha256_emit,.-${pre}sha256_emit - -.globl ${pre}sha256_bcopy -.hidden ${pre}sha256_bcopy -.type ${pre}sha256_bcopy,\@abi-omnipotent -.align 16 -${pre}sha256_bcopy: - sub $inp, $out -.Loop_bcopy: - movzb ($inp), %eax - lea 1($inp), $inp - mov %al, -1($out,$inp) - dec $len - jnz .Loop_bcopy - ret -.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy - -.globl ${pre}sha256_hcopy -.hidden ${pre}sha256_hcopy -.type ${pre}sha256_hcopy,\@abi-omnipotent -.align 16 -${pre}sha256_hcopy: - mov 0($inp), %r8 - mov 8($inp), %r9 - mov 16($inp), %r10 - mov 24($inp), %r11 - mov %r8, 0($out) - mov %r9, 8($out) - mov %r10, 16($out) - mov %r11, 24($out) - ret -.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy -___ -} - -sub sha256op38 { - my $instr = shift; - my %opcodelet = ( - "sha256rnds2" => 0xcb, - "sha256msg1" => 0xcc, - "sha256msg2" => 0xcd ); - - if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { - my @opcode=(0x0f,0x38); - push @opcode,$opcodelet{$instr}; - push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M - return ".byte\t".join(',',@opcode); - } else { - return $instr."\t".@_[0]; - } -} - -foreach (split("\n",$code)) { - s/\`([^\`]*)\`/eval $1/geo; - - s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; - - print $_,"\n"; -} -close STDOUT; diff --git a/crypto/blst_src/asm/x86_64-xlate.pl b/crypto/blst_src/asm/x86_64-xlate.pl deleted file mode 100755 index 62be619d9fc..00000000000 --- a/crypto/blst_src/asm/x86_64-xlate.pl +++ /dev/null @@ -1,1781 +0,0 @@ -#!/usr/bin/env perl -# -# Copyright Supranational LLC -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 -# -# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. -# -# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T -# format is way easier to parse. Because it's simpler to "gear" from -# Unix ABI to Windows one [see cross-reference "card" at the end of -# file]. Because Linux targets were available first... -# -# In addition the script also "distills" code suitable for GNU -# assembler, so that it can be compiled with more rigid assemblers, -# such as Solaris /usr/ccs/bin/as. -# -# This translator is not designed to convert *arbitrary* assembler -# code from AT&T format to MASM one. It's designed to convert just -# enough to provide for dual-ABI OpenSSL modules development... -# There *are* limitations and you might have to modify your assembler -# code or this script to achieve the desired result... -# -# Currently recognized limitations: -# -# - can't use multiple ops per line; -# -# Dual-ABI styling rules. -# -# 1. Adhere to Unix register and stack layout [see cross-reference -# ABI "card" at the end for explanation]. -# 2. Forget about "red zone," stick to more traditional blended -# stack frame allocation. If volatile storage is actually required -# that is. If not, just leave the stack as is. -# 3. 
Functions tagged with ".type name,@function" get crafted with -# unified Win64 prologue and epilogue automatically. If you want -# to take care of ABI differences yourself, tag functions as -# ".type name,@abi-omnipotent" instead. -# 4. To optimize the Win64 prologue you can specify number of input -# arguments as ".type name,@function,N." Keep in mind that if N is -# larger than 6, then you *have to* write "abi-omnipotent" code, -# because >6 cases can't be addressed with unified prologue. -# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: -# (sorry about latter). -# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is -# required to identify the spots, where to inject Win64 epilogue! -# But on the pros, it's then prefixed with rep automatically:-) -# 7. Stick to explicit ip-relative addressing. If you have to use -# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. -# Both are recognized and translated to proper Win64 addressing -# modes. -# -# 8. In order to provide for structured exception handling unified -# Win64 prologue copies %rsp value to %rax. [Unless function is -# tagged with additional .type tag.] For further details see SEH -# paragraph at the end. -# 9. .init segment is allowed to contain calls to functions only. -# a. If function accepts more than 4 arguments *and* >4th argument -# is declared as non 64-bit value, do clear its upper part. - - -use strict; - -my $flavour = shift; -my $output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -open STDOUT,">$output" || die "can't open $output: $!" - if (defined($output)); - -my $gas=1; $gas=0 if ($output =~ /\.asm$/); -my $elf=1; $elf=0 if (!$gas); -my $dwarf=$elf; -my $win64=0; -my $prefix=""; -my $decor=".L"; - -my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 -my $masm=0; -my $PTR=" PTR"; - -my $nasmref=2.03; -my $nasm=0; - -if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; - $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; - $prefix =~ s|\R$||; # Better chomp - } -elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } -elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } -elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } -elsif (!$gas) -{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) - { $nasm = $1 + $2*0.01; $PTR=""; } - elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) - { $masm = $1 + $2*2**-16 + $4*2**-32; } - die "no assembler found on %PATH%" if (!($nasm || $masm)); - $win64=1; - $elf=0; - $decor="\$L\$"; -} - -$dwarf=0 if($win64); - -my $current_segment; -my $current_function; -my %globals; - -{ package opcode; # pick up opcodes - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /^([a-z][a-z0-9]*)/i) { - bless $self,$class; - $self->{op} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - undef $self->{sz}; - if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
- $self->{op} = $1; - $self->{sz} = $2; - } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { - # pass through - } elsif ($self->{op} =~ /call|jmp/) { - $self->{sz} = ""; - } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn - $self->{sz} = ""; - } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov - $self->{sz} = ""; - } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { - $self->{sz} = ""; - } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { - $self->{op} = $1; - $self->{sz} = $2; - } - } - $ret; - } - sub size { - my ($self, $sz) = @_; - $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); - $self->{sz}; - } - sub out { - my $self = shift; - if ($gas) { - if ($self->{op} eq "movz") { # movz is pain... - sprintf "%s%s%s",$self->{op},$self->{sz},shift; - } elsif ($self->{op} =~ /^set/) { - "$self->{op}"; - } elsif ($self->{op} eq "ret") { - my $epilogue = ""; - if ($win64 && $current_function->{abi} eq "svr4" - && !$current_function->{unwind}) { - $epilogue = "movq 8(%rsp),%rdi\n\t" . - "movq 16(%rsp),%rsi\n\t"; - } - $epilogue . ".byte 0xf3,0xc3"; - } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { - ".p2align\t3\n\t.quad"; - } else { - "$self->{op}$self->{sz}"; - } - } else { - $self->{op} =~ s/^movz/movzx/; - if ($self->{op} eq "ret") { - $self->{op} = ""; - if ($win64 && $current_function->{abi} eq "svr4" - && !$current_function->{unwind}) { - $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". - "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; - } - $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; - } elsif ($self->{op} =~ /^(pop|push)f/) { - $self->{op} .= $self->{sz}; - } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { - $self->{op} = "\tDQ"; - } - $self->{op}; - } - } - sub mnemonic { - my ($self, $op) = @_; - $self->{op}=$op if (defined($op)); - $self->{op}; - } -} -{ package const; # pick up constants, which start with $ - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /^\$([^,]+)/) { - bless $self, $class; - $self->{value} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - } - $ret; - } - sub out { - my $self = shift; - - $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; - if ($gas) { - # Solaris /usr/ccs/bin/as can't handle multiplications - # in $self->{value} - my $value = $self->{value}; - no warnings; # oct might complain about overflow, ignore here... 
- $value =~ s/(?{value} = $value; - } - sprintf "\$%s",$self->{value}; - } else { - my $value = $self->{value}; - $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); - sprintf "%s",$value; - } - } -} -{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) - - my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", - l=>"DWORD$PTR", d=>"DWORD$PTR", - q=>"QWORD$PTR", o=>"OWORD$PTR", - x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", - z=>"ZMMWORD$PTR" ) if (!$gas); - - my %sifmap = ( ss=>"d", sd=>"q", # broadcast only - i32x2=>"q", f32x2=>"q", - i32x4=>"x", i64x2=>"x", i128=>"x", - f32x4=>"x", f64x2=>"x", f128=>"x", - i32x8=>"y", i64x4=>"y", - f32x8=>"y", f64x4=>"y" ) if (!$gas); - - sub re { - my ($class, $line, $opcode) = @_; - my $self = {}; - my $ret; - - # optional * ----vvv--- appears in indirect jmp/call - if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { - bless $self, $class; - $self->{asterisk} = $1; - $self->{label} = $2; - ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); - $self->{scale} = 1 if (!defined($self->{scale})); - $self->{opmask} = $4; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { - die if ($opcode->mnemonic() ne "mov"); - $opcode->mnemonic("lea"); - } - $self->{base} =~ s/^%//; - $self->{index} =~ s/^%// if (defined($self->{index})); - $self->{opcode} = $opcode; - } - $ret; - } - sub size {} - sub out { - my ($self, $sz) = @_; - - $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $self->{label} =~ s/\.L/$decor/g; - - # Silently convert all EAs to 64-bit. This is required for - # elder GNU assembler and results in more compact code, - # *but* most importantly AES module depends on this feature! - $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; - $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; - - # Solaris /usr/ccs/bin/as can't handle multiplications - # in $self->{label}... - use integer; - $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; - - # Some assemblers insist on signed presentation of 32-bit - # offsets, but sign extension is a tricky business in perl... 
- $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; - - # if base register is %rbp or %r13, see if it's possible to - # flip base and index registers [for better performance] - if (!$self->{label} && $self->{index} && $self->{scale}==1 && - $self->{base} =~ /(rbp|r13)/) { - $self->{base} = $self->{index}; $self->{index} = $1; - } - - if ($gas) { - $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); - - if (defined($self->{index})) { - sprintf "%s%s(%s,%%%s,%d)%s", - $self->{asterisk},$self->{label}, - $self->{base}?"%$self->{base}":"", - $self->{index},$self->{scale}, - $self->{opmask}; - } else { - sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, - $self->{base},$self->{opmask}; - } - } else { - $self->{label} =~ s/\./\$/g; - $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); - - my $mnemonic = $self->{opcode}->mnemonic(); - ($self->{asterisk}) && ($sz="q") || - ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || - ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || - ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || - ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) - && ($sz=$sifmap{$1}); - - $self->{opmask} =~ s/%(k[0-7])/$1/; - - if (defined($self->{index})) { - sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, - $self->{label}?"$self->{label}+":"", - $self->{index},$self->{scale}, - $self->{base}?"+$self->{base}":"", - $self->{opmask}; - } elsif ($self->{base} eq "rip") { - sprintf "%s[%s]",$szmap{$sz},$self->{label}; - } else { - sprintf "%s[%s%s]%s", $szmap{$sz}, - $self->{label}?"$self->{label}+":"", - $self->{base},$self->{opmask}; - } - } - } -} -{ package register; # pick up registers, which start with %. - sub re { - my ($class, $line, $opcode) = @_; - my $self = {}; - my $ret; - - # optional * ----vvv--- appears in indirect jmp/call - if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { - bless $self,$class; - $self->{asterisk} = $1; - $self->{value} = $2; - $self->{opmask} = $3; - $opcode->size($self->size()); - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - } - $ret; - } - sub size { - my $self = shift; - my $ret; - - if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } - elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } - elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } - elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } - elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } - elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } - elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } - elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } - - $ret; - } - sub out { - my $self = shift; - if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, - $self->{value}, - $self->{opmask}; } - else { $self->{opmask} =~ s/%(k[0-7])/$1/; - $self->{value}.$self->{opmask}; } - } -} -{ package label; # pick up labels, which end with : - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /(^[\.\w]+)\:/) { - bless $self,$class; - $self->{value} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - $self->{value} =~ s/^\.L/$decor/; - } - $ret; - } - sub out { - my $self = shift; - - if ($gas) { - my $func = ($globals{$self->{value}} or $self->{value}) . ":"; - if ($current_function->{name} eq $self->{value}) { - $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); - $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch - if ($win64 && $current_function->{abi} eq "svr4") { - my $fp = $current_function->{unwind} ? 
"%r11" : "%rax"; - $func .= " movq %rdi,8(%rsp)\n"; - $func .= " movq %rsi,16(%rsp)\n"; - $func .= " movq %rsp,$fp\n"; - $func .= "${decor}SEH_begin_$current_function->{name}:\n"; - my $narg = $current_function->{narg}; - $narg=6 if (!defined($narg)); - $func .= " movq %rcx,%rdi\n" if ($narg>0); - $func .= " movq %rdx,%rsi\n" if ($narg>1); - $func .= " movq %r8,%rdx\n" if ($narg>2); - $func .= " movq %r9,%rcx\n" if ($narg>3); - $func .= " movq 40(%rsp),%r8\n" if ($narg>4); - $func .= " movq 48(%rsp),%r9\n" if ($narg>5); - } - } - $func; - } elsif ($self->{value} ne "$current_function->{name}") { - # Make all labels in masm global. - $self->{value} .= ":" if ($masm); - $self->{value} . ":"; - } elsif ($win64 && $current_function->{abi} eq "svr4") { - my $func = "$current_function->{name}" . - ($nasm ? ":" : "\tPROC $current_function->{scope}") . - "\n"; - my $fp = $current_function->{unwind} ? "r11" : "rax"; - $func .= " DB 243,15,30,250\n"; # endbranch - $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; - $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; - $func .= " mov $fp,rsp\n"; - $func .= "${decor}SEH_begin_$current_function->{name}:"; - $func .= ":" if ($masm); - $func .= "\n"; - my $narg = $current_function->{narg}; - $narg=6 if (!defined($narg)); - $func .= " mov rdi,rcx\n" if ($narg>0); - $func .= " mov rsi,rdx\n" if ($narg>1); - $func .= " mov rdx,r8\n" if ($narg>2); - $func .= " mov rcx,r9\n" if ($narg>3); - $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); - $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); - $func .= "\n"; - } else { - "$current_function->{name}". - ($nasm ? ":" : "\tPROC $current_function->{scope}"). - "\n DB 243,15,30,250"; # endbranch - } - } -} -{ package expr; # pick up expressions - sub re { - my ($class, $line, $opcode) = @_; - my $self = {}; - my $ret; - - if ($$line =~ /(^[^,]+)/) { - bless $self,$class; - $self->{value} = $1; - $ret = $self; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - $self->{value} =~ s/\@PLT// if (!$elf); - $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $self->{value} =~ s/\.L/$decor/g; - $self->{opcode} = $opcode; - } - $ret; - } - sub out { - my $self = shift; - $self->{value}; - } -} - -my @xdata_seg = (".section .xdata", ".align 8"); -my @pdata_seg = (".section .pdata", ".align 4"); - -{ package cfi_directive; - # CFI directives annotate instructions that are significant for - # stack unwinding procedure compliant with DWARF specification, - # see http://dwarfstd.org/. Besides naturally expected for this - # script platform-specific filtering function, this module adds - # three auxiliary synthetic directives not recognized by [GNU] - # assembler: - # - # - .cfi_push to annotate push instructions in prologue, which - # translates to .cfi_adjust_cfa_offset (if needed) and - # .cfi_offset; - # - .cfi_pop to annotate pop instructions in epilogue, which - # translates to .cfi_adjust_cfa_offset (if needed) and - # .cfi_restore; - # - [and most notably] .cfi_cfa_expression which encodes - # DW_CFA_def_cfa_expression and passes it to .cfi_escape as - # byte vector; - # - # CFA expressions were introduced in DWARF specification version - # 3 and describe how to deduce CFA, Canonical Frame Address. This - # becomes handy if your stack frame is variable and you can't - # spare register for [previous] frame pointer. Suggested directive - # syntax is made-up mix of DWARF operator suffixes [subset of] - # and references to registers with optional bias. 
Following example - # describes offloaded *original* stack pointer at specific offset - # from *current* stack pointer: - # - # .cfi_cfa_expression %rsp+40,deref,+8 - # - # Final +8 has everything to do with the fact that CFA is defined - # as reference to top of caller's stack, and on x86_64 call to - # subroutine pushes 8-byte return address. In other words original - # stack pointer upon entry to a subroutine is 8 bytes off from CFA. - # - # In addition the .cfi directives are re-purposed even for Win64 - # stack unwinding. Two more synthetic directives were added: - # - # - .cfi_end_prologue to denote point when all non-volatile - # registers are saved and stack or [chosen] frame pointer is - # stable; - # - .cfi_epilogue to denote point when all non-volatile registers - # are restored [and it even adds missing .cfi_restore-s]; - # - # Though it's not universal "miracle cure," it has its limitations. - # Most notably .cfi_cfa_expression won't start working... For more - # information see the end of this file. - - # Below constants are taken from "DWARF Expressions" section of the - # DWARF specification, section is numbered 7.7 in versions 3 and 4. - my %DW_OP_simple = ( # no-arg operators, mapped directly - deref => 0x06, dup => 0x12, - drop => 0x13, over => 0x14, - pick => 0x15, swap => 0x16, - rot => 0x17, xderef => 0x18, - - abs => 0x19, and => 0x1a, - div => 0x1b, minus => 0x1c, - mod => 0x1d, mul => 0x1e, - neg => 0x1f, not => 0x20, - or => 0x21, plus => 0x22, - shl => 0x24, shr => 0x25, - shra => 0x26, xor => 0x27, - ); - - my %DW_OP_complex = ( # used in specific subroutines - constu => 0x10, # uleb128 - consts => 0x11, # sleb128 - plus_uconst => 0x23, # uleb128 - lit0 => 0x30, # add 0-31 to opcode - reg0 => 0x50, # add 0-31 to opcode - breg0 => 0x70, # add 0-31 to opcole, sleb128 - regx => 0x90, # uleb28 - fbreg => 0x91, # sleb128 - bregx => 0x92, # uleb128, sleb128 - piece => 0x93, # uleb128 - ); - - # Following constants are defined in x86_64 ABI supplement, for - # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, - # see section 3.7 "Stack Unwind Algorithm". - my %DW_reg_idx = ( - "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, - "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, - "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, - "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 - ); - - my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); - my @cfa_stack; - - # [us]leb128 format is variable-length integer representation base - # 2^128, with most significant bit of each byte being 0 denoting - # *last* most significant digit. See "Variable Length Data" in the - # DWARF specification, numbered 7.6 at least in versions 3 and 4. - sub sleb128 { - use integer; # get right shift extend sign - - my $val = shift; - my $sign = ($val < 0) ? -1 : 0; - my @ret = (); - - while(1) { - push @ret, $val&0x7f; - - # see if remaining bits are same and equal to most - # significant bit of the current digit, if so, it's - # last digit... - last if (($val>>6) == $sign); - - @ret[-1] |= 0x80; - $val >>= 7; - } - - return @ret; - } - sub uleb128 { - my $val = shift; - my @ret = (); - - while(1) { - push @ret, $val&0x7f; - - # see if it's last significant digit... 
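For orientation, the sleb128/uleb128 helpers being removed here implement standard DWARF LEB128: seven payload bits per output byte, the high bit set on every byte except the last, with the signed variant relying on an arithmetic right shift to carry the sign. An equivalent sketch in Go (illustrative only, not taken from this patch), with two worked encodings in main:

    // Illustrative Go equivalents of the sleb128/uleb128 helpers above.
    package main

    import "fmt"

    func sleb128(v int64) []byte {
        var out []byte
        for {
            b := byte(v & 0x7f)
            v >>= 7 // arithmetic shift keeps the sign
            // done once the remaining bits are just the sign extension of this digit
            if (v == 0 && b&0x40 == 0) || (v == -1 && b&0x40 != 0) {
                return append(out, b)
            }
            out = append(out, b|0x80)
        }
    }

    func uleb128(v uint64) []byte {
        var out []byte
        for {
            b := byte(v & 0x7f)
            v >>= 7
            if v == 0 {
                return append(out, b)
            }
            out = append(out, b|0x80)
        }
    }

    func main() {
        fmt.Printf("% x\n", sleb128(-8))  // 78, e.g. a CFA offset of -8
        fmt.Printf("% x\n", uleb128(300)) // ac 02
    }
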
- last if (($val >>= 7) == 0); - - @ret[-1] |= 0x80; - } - - return @ret; - } - sub const { - my $val = shift; - - if ($val >= 0 && $val < 32) { - return ($DW_OP_complex{lit0}+$val); - } - return ($DW_OP_complex{consts}, sleb128($val)); - } - sub reg { - my $val = shift; - - return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); - - my $reg = $DW_reg_idx{$1}; - my $off = eval ("0 $2 $3"); - - return (($DW_OP_complex{breg0} + $reg), sleb128($off)); - # Yes, we use DW_OP_bregX+0 to push register value and not - # DW_OP_regX, because latter would require even DW_OP_piece, - # which would be a waste under the circumstances. If you have - # to use DWP_OP_reg, use "regx:N"... - } - sub cfa_expression { - my $line = shift; - my @ret; - - foreach my $token (split(/,\s*/,$line)) { - if ($token =~ /^%r/) { - push @ret,reg($token); - } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { - push @ret,reg("$2+$1"); - } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { - my $i = 1*eval($2); - push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); - } elsif (my $i = 1*eval($token) or $token eq "0") { - if ($token =~ /^\+/) { - push @ret,$DW_OP_complex{plus_uconst},uleb128($i); - } else { - push @ret,const($i); - } - } else { - push @ret,$DW_OP_simple{$token}; - } - } - - # Finally we return DW_CFA_def_cfa_expression, 15, followed by - # length of the expression and of course the expression itself. - return (15,scalar(@ret),@ret); - } - - # Following constants are defined in "x64 exception handling" at - # https://docs.microsoft.com/ and match the register sequence in - # CONTEXT structure defined in winnt.h. - my %WIN64_reg_idx = ( - "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, - "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, - "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, - "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 - ); - sub xdata { - our @dat = (); - our $len = 0; - - sub allocstack { - my $offset = shift; - - if ($offset) { - if ($offset <= 128) { - $offset = ($offset - 8) >> 3; - push @dat, [0,$offset<<4|2]; # UWOP_ALLOC_SMALL - } elsif ($offset < 0x80000) { - push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; - } else { - push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; - } - $len += $#{@dat[-1]}+1; - } - } - - # allocate stack frame - if (my $offset = -8 - $cfa_rsp) { - # but see if frame pointer is among saved registers - if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) { - $fp_off = -8 - $fp_off; - allocstack($fp_off-8); - $offset -= $fp_off; - push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL - $len += $#{@dat[-1]}+1; - } - allocstack($offset); - } - # set up frame pointer - my $fp_info = 0; - if ($cfa_reg ne "%rsp") { - my $offset = $cfa_off - $cfa_rsp; - ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset"; - $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg}; - push @dat, [0,3]; # UWOP_SET_FPREG - $len += $#{@dat[-1]}+1; - } - # save registers - foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } - keys(%saved_regs)) { - next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); - my $offset = $saved_regs{$key} - $cfa_rsp; - if ($key =~ /%xmm([0-9]+)/) { - if ($offset < 0x100000) { - push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; - } else { - push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; - } - } else { - if ($offset < 0x80000) { - push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, - unpack("C2",pack("v",$offset>>3))]; - } else { - push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, - 
unpack("C4",pack("V",$offset))]; - } - } - $len += $#{@dat[-1]}+1; - } - - my @ret; - # generate 4-byte descriptor - push @ret, ".byte 1,0,".($len/2).",$fp_info"; - $len += 4; - # pad to 8*n - unshift @dat, [(0)x((-$len)&7)] if ($len&7); - # emit data - while(defined(my $row = pop @dat)) { - push @ret, ".byte ". join(",", - map { sprintf "0x%02x",$_ } @{$row}); - } - - return @ret; - } - sub startproc { - return if ($cfa_rsp == -8); - ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); - %saved_regs = (); - return "startproc"; - } - sub endproc { - return if ($cfa_rsp == 0); - ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); - %saved_regs = (); - return "endproc"; - } - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - - if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { - bless $self,$class; - $ret = $self; - undef $self->{value}; - my $dir = $1; - - SWITCH: for ($dir) { - # What is $cfa_rsp? Effectively it's difference between %rsp - # value and current CFA, Canonical Frame Address, which is - # why it starts with -8. Recall that CFA is top of caller's - # stack... - /startproc/ && do { $dir = startproc(); last; }; - /endproc/ && do { $dir = endproc(); - # .cfi_remember_state directives that are not - # matched with .cfi_restore_state are - # unnecessary. - die "unpaired .cfi_remember_state" if (@cfa_stack); - last; - }; - /def_cfa_register/ - && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); - $cfa_reg = $$line; - last; - }; - /def_cfa_offset/ - && do { $cfa_off = -1*eval($$line); - $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); - last; - }; - /adjust_cfa_offset/ - && do { my $val = 1*eval($$line); - $cfa_off -= $val; - if ($cfa_reg eq "%rsp") { - $cfa_rsp -= $val; - } - last; - }; - /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { - $cfa_reg = $1; - $cfa_off = -1*eval($2); - $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); - } - last; - }; - /push/ && do { $dir = undef; - $cfa_rsp -= 8; - if ($cfa_reg eq "%rsp") { - $cfa_off = $cfa_rsp; - $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; - } - $saved_regs{$$line} = $cfa_rsp; - $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; - last; - }; - /pop/ && do { $dir = undef; - $cfa_rsp += 8; - if ($cfa_reg eq "%rsp") { - $cfa_off = $cfa_rsp; - $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; - } - $self->{value} .= ".cfi_restore\t$$line"; - delete $saved_regs{$$line}; - last; - }; - /cfa_expression/ - && do { $dir = undef; - $self->{value} = ".cfi_escape\t" . - join(",", map(sprintf("0x%02x", $_), - cfa_expression($$line))); - last; - }; - /remember_state/ - && do { push @cfa_stack, - [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; - last; - }; - /restore_state/ - && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) - = @{pop @cfa_stack}; - last; - }; - /offset/ && do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) { - $saved_regs{$1} = 1*eval($2); - $dir = undef if ($1 =~ /%xmm/); - } - last; - }; - /restore/ && do { delete $saved_regs{$$line}; last; }; - /end_prologue/ - && do { $dir = undef; - $self->{win64} = ".endprolog"; - last; - }; - /epilogue/ && do { $dir = undef; - $self->{win64} = ".epilogue"; - $self->{value} = join("\n", - map { ".cfi_restore\t$_" } - sort keys(%saved_regs)); - %saved_regs = (); - last; - }; - } - - $self->{value} = ".cfi_$dir\t$$line" if ($dir); - - $$line = ""; - } - - return $ret; - } - sub out { - my $self = shift; - return $self->{value} if ($dwarf); - - if ($win64 and $current_function->{unwind} - and my $ret = $self->{win64}) { - my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? 
($', $cfa_off) - : ("rsp", $cfa_rsp); - my $fname = $current_function->{name}; - - if ($ret eq ".endprolog") { - $saved_regs{"%rdi"} = 0; # relative to CFA, remember? - $saved_regs{"%rsi"} = 8; - - push @pdata_seg, - ".rva .LSEH_begin_${fname}", - ".rva .LSEH_body_${fname}", - ".rva .LSEH_info_${fname}_prologue",""; - push @xdata_seg, - ".LSEH_info_${fname}_prologue:", - ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP - ".byte 0,0x74,1,0", # %rdi at 8(%rsp) - ".byte 0,0x64,2,0", # %rsi at 16(%rsp) - ".byte 0,0x03", # set frame pointer - ".byte 0,0" # padding - ; - push @pdata_seg, - ".rva .LSEH_body_${fname}", - ".rva .LSEH_epilogue_${fname}", - ".rva .LSEH_info_${fname}_body",""; - push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); - $ret = "${decor}SEH_body_${fname}:"; - $ret .= ":" if ($masm); $ret .= "\n"; - } elsif ($ret eq ".epilogue") { - %saved_regs = (); - $saved_regs{"%rdi"} = 0; # relative to CFA, remember? - $saved_regs{"%rsi"} = 8; - $cfa_rsp = $cfa_off; - - push @pdata_seg, - ".rva .LSEH_epilogue_${fname}", - ".rva .LSEH_end_${fname}", - ".rva .LSEH_info_${fname}_epilogue",""; - push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; - $ret = "${decor}SEH_epilogue_${fname}:"; - $ret .= ":" if ($masm); $ret .= "\n"; - if ($gas) { - $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; - $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; - } else { - $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; - $ret .= " ;WIN64 epilogue\n"; - $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; - } - } - return $ret; - } - return; - } -} -{ package directive; # pick up directives, which start with . - sub re { - my ($class, $line) = @_; - my $self = {}; - my $ret; - my $dir; - - # chain-call to cfi_directive - $ret = cfi_directive->re($line) and return $ret; - - if ($$line =~ /^\s*(\.\w+)/) { - bless $self,$class; - $dir = $1; - $ret = $self; - undef $self->{value}; - $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; - - SWITCH: for ($dir) { - /\.global|\.globl|\.extern/ - && do { $globals{$$line} = $prefix . $$line; - $$line = $globals{$$line} if ($prefix); - last; - }; - /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); - if ($type eq "\@function") { - undef $current_function; - $current_function->{name} = $sym; - $current_function->{abi} = "svr4"; - $current_function->{narg} = $narg; - $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; - $current_function->{unwind} = $unwind; - } elsif ($type eq "\@abi-omnipotent") { - undef $current_function; - $current_function->{name} = $sym; - $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; - } - $$line =~ s/\@abi\-omnipotent/\@function/; - $$line =~ s/\@function.*/\@function/; - last; - }; - /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { - $dir = ".byte"; - $$line = join(",",unpack("C*",$1),0); - } - last; - }; - /\.rva|\.long|\.quad/ - && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; - $$line =~ s/\.L/$decor/g; - last; - }; - } - - if ($gas) { - $self->{value} = $dir . "\t" . $$line; - - if ($dir =~ /\.extern/) { - $self->{value} = ""; # swallow extern - } elsif (!$elf && $dir =~ /\.type/) { - $self->{value} = ""; - $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . - (defined($globals{$1})?".scl 2;":".scl 3;") . 
- "\t.type 32;\t.endef" - if ($win64 && $$line =~ /([^,]+),\@function/); - } elsif ($dir =~ /\.size/) { - $self->{value} = "" if (!$elf); - if ($dwarf and my $endproc = cfi_directive::endproc()) { - $self->{value} = ".cfi_$endproc\n$self->{value}"; - } elsif (!$elf && defined($current_function)) { - $self->{value} .= "${decor}SEH_end_$current_function->{name}:" - if ($win64 && $current_function->{abi} eq "svr4"); - undef $current_function; - } - } elsif (!$elf && $dir =~ /\.align/) { - $self->{value} = ".p2align\t" . (log($$line)/log(2)); - } elsif ($dir eq ".section") { - $current_segment=$$line; - if (!$elf && $current_segment eq ".init") { - if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } - elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } - } - } elsif ($dir =~ /\.(text|data)/) { - $current_segment=".$1"; - } elsif ($dir =~ /\.hidden/) { - if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } - elsif ($flavour eq "mingw64") { $self->{value} = ""; } - } elsif ($dir =~ /\.comm/) { - $self->{value} = "$dir\t$prefix$$line"; - $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); - } - $$line = ""; - return $self; - } - - # non-gas case or nasm/masm - SWITCH: for ($dir) { - /\.text/ && do { my $v=undef; - if ($nasm) { - $v="section .text code align=64\n"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = ".text\$"; - $v.="$current_segment\tSEGMENT "; - $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; - $v.=" 'CODE'"; - } - $self->{value} = $v; - last; - }; - /\.data/ && do { my $v=undef; - if ($nasm) { - $v="section .data data align=8\n"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = "_DATA"; - $v.="$current_segment\tSEGMENT"; - } - $self->{value} = $v; - last; - }; - /\.section/ && do { my $v=undef; - $$line =~ s/([^,]*).*/$1/; - $$line = ".CRT\$XCU" if ($$line eq ".init"); - if ($nasm) { - $v="section $$line"; - if ($$line=~/\.([px])data/) { - $v.=" rdata align="; - $v.=$1 eq "p"? 4 : 8; - } elsif ($$line=~/\.CRT\$/i) { - $v.=" rdata align=8"; - } - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $v.="$$line\tSEGMENT"; - if ($$line=~/\.([px])data/) { - $v.=" READONLY"; - $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); - } elsif ($$line=~/\.CRT\$/i) { - $v.=" READONLY "; - $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; - } - } - $current_segment = $$line; - $self->{value} = $v; - last; - }; - /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; - $self->{value} .= ":NEAR" if ($masm); - last; - }; - /\.globl|.global/ - && do { $self->{value} = $masm?"PUBLIC":"global"; - $self->{value} .= "\t".$$line; - last; - }; - /\.size/ && do { if (defined($current_function)) { - undef $self->{value}; - if ($current_function->{abi} eq "svr4") { - $self->{value}="${decor}SEH_end_$current_function->{name}:"; - $self->{value}.=":\n" if($masm); - } - $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); - undef $current_function; - } - last; - }; - /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; - $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); - last; - }; - /\.(value|long|rva|quad)/ - && do { my $sz = substr($1,0,1); - my @arr = split(/,\s*/,$$line); - my $last = pop(@arr); - my $conv = sub { my $var=shift; - $var=~s/^(0b[0-1]+)/oct($1)/eig; - $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); - if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) - { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } - $var; - }; - - $sz =~ tr/bvlrq/BWDDQ/; - $self->{value} = "\tD$sz\t"; - for (@arr) { $self->{value} .= &$conv($_).","; } - $self->{value} .= &$conv($last); - last; - }; - /\.byte/ && do { my @str=split(/,\s*/,$$line); - map(s/(0b[0-1]+)/oct($1)/eig,@str); - map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); - while ($#str>15) { - $self->{value}.="DB\t" - .join(",",@str[0..15])."\n"; - foreach (0..15) { shift @str; } - } - $self->{value}.="DB\t" - .join(",",@str) if (@str); - last; - }; - /\.comm/ && do { my @str=split(/,\s*/,$$line); - my $v=undef; - if ($nasm) { - $v.="common $prefix@str[0] @str[1]"; - } else { - $v="$current_segment\tENDS\n" if ($current_segment); - $current_segment = "_DATA"; - $v.="$current_segment\tSEGMENT\n"; - $v.="COMM @str[0]:DWORD:".@str[1]/4; - } - $self->{value} = $v; - last; - }; - } - $$line = ""; - } - - $ret; - } - sub out { - my $self = shift; - $self->{value}; - } -} - -# Upon initial x86_64 introduction SSE>2 extensions were not introduced -# yet. In order not to be bothered by tracing exact assembler versions, -# but at the same time to provide a bare security minimum of AES-NI, we -# hard-code some instructions. Extensions past AES-NI on the other hand -# are traced by examining assembler version in individual perlasm -# modules... - -my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, - "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); - -sub rex { - my $opcode=shift; - my ($dst,$src,$rex)=@_; - - $rex|=0x04 if($dst>=8); - $rex|=0x01 if($src>=8); - push @$opcode,($rex|0x40) if ($rex); -} - -my $movq = sub { # elderly gas can't handle inter-register movq - my $arg = shift; - my @opcode=(0x66); - if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { - my ($src,$dst)=($1,$2); - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,$src,$dst,0x8); - push @opcode,0x0f,0x7e; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - @opcode; - } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { - my ($src,$dst)=($2,$1); - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,$src,$dst,0x8); - push @opcode,0x0f,0x6e; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - @opcode; - } else { - (); - } -}; - -my $pextrd = sub { - if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { - my @opcode=(0x66); - my $imm=$1; - my $src=$2; - my $dst=$3; - if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } - elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } - rex(\@opcode,$src,$dst); - push @opcode,0x0f,0x3a,0x16; - push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M - push @opcode,$imm; - @opcode; - } else { - (); - } -}; - -my $pinsrd = sub { - if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - my $imm=$1; - my $src=$2; - my $dst=$3; - if ($src =~ /%r([0-9]+)/) { $src = $1; } - elsif ($src =~ /%e/) { $src = $regrm{$src}; } - rex(\@opcode,$dst,$src); - push @opcode,0x0f,0x3a,0x22; - push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M - push @opcode,$imm; - @opcode; - } else { - (); - } -}; - -my $pshufb = sub { - if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - 
rex(\@opcode,$2,$1); - push @opcode,0x0f,0x38,0x00; - push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M - @opcode; - } else { - (); - } -}; - -my $palignr = sub { - if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - rex(\@opcode,$3,$2); - push @opcode,0x0f,0x3a,0x0f; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - push @opcode,$1; - @opcode; - } else { - (); - } -}; - -my $pclmulqdq = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x66); - rex(\@opcode,$3,$2); - push @opcode,0x0f,0x3a,0x44; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -my $rdrand = sub { - if (shift =~ /%[er](\w+)/) { - my @opcode=(); - my $dst=$1; - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,0,$dst,8); - push @opcode,0x0f,0xc7,0xf0|($dst&7); - @opcode; - } else { - (); - } -}; - -my $rdseed = sub { - if (shift =~ /%[er](\w+)/) { - my @opcode=(); - my $dst=$1; - if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } - rex(\@opcode,0,$dst,8); - push @opcode,0x0f,0xc7,0xf8|($dst&7); - @opcode; - } else { - (); - } -}; - -# Not all AVX-capable assemblers recognize AMD XOP extension. Since we -# are using only two instructions hand-code them in order to be excused -# from chasing assembler versions... - -sub rxb { - my $opcode=shift; - my ($dst,$src1,$src2,$rxb)=@_; - - $rxb|=0x7<<5; - $rxb&=~(0x04<<5) if($dst>=8); - $rxb&=~(0x01<<5) if($src1>=8); - $rxb&=~(0x02<<5) if($src2>=8); - push @$opcode,$rxb; -} - -my $vprotd = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x8f); - rxb(\@opcode,$3,$2,-1,0x08); - push @opcode,0x78,0xc2; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -my $vprotq = sub { - if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { - my @opcode=(0x8f); - rxb(\@opcode,$3,$2,-1,0x08); - push @opcode,0x78,0xc3; - push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M - my $c=$1; - push @opcode,$c=~/^0/?oct($c):$c; - @opcode; - } else { - (); - } -}; - -# Intel Control-flow Enforcement Technology extension. All functions and -# indirect branch targets will have to start with this instruction... -# However, it should not be used in functions' prologues explicitly, as -# it's added automatically [and in the right spot]. Which leaves only -# non-function indirect branch targets, such as in a case-like dispatch -# table, as application area. - -my $endbr64 = sub { - (0xf3,0x0f,0x1e,0xfa); -}; - -######################################################################## - -if ($nasm) { - print <<___; -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -___ -} elsif ($masm) { - print <<___; -OPTION DOTNAME -___ -} - -sub process { - my $line = shift; - - $line =~ s|\R$||; # Better chomp - - $line =~ s|[#!].*$||; # get rid of asm-style comments... - $line =~ s|/\*.*\*/||; # ... and C-style comments... - $line =~ s|^\s+||; # ... and skip white spaces in beginning - $line =~ s|\s+$||; # ... 
and at the end - - if (my $label=label->re(\$line)) { print $label->out(); } - - if (my $directive=directive->re(\$line)) { - printf "%s",$directive->out(); - } elsif (my $opcode=opcode->re(\$line)) { - my $asm = eval("\$".$opcode->mnemonic()); - - if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { - print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; - next; - } - - my @args; - ARGUMENT: while (1) { - my $arg; - - ($arg=register->re(\$line, $opcode))|| - ($arg=const->re(\$line)) || - ($arg=ea->re(\$line, $opcode)) || - ($arg=expr->re(\$line, $opcode)) || - last ARGUMENT; - - push @args,$arg; - - last ARGUMENT if ($line !~ /^,/); - - $line =~ s/^,\s*//; - } # ARGUMENT: - - if ($#args>=0) { - my $insn; - my $sz=$opcode->size(); - - if ($gas) { - $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); - @args = map($_->out($sz),@args); - printf "\t%s\t%s",$insn,join(",",@args); - } else { - $insn = $opcode->out(); - foreach (@args) { - my $arg = $_->out(); - # $insn.=$sz compensates for movq, pinsrw, ... - if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } - if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } - if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } - if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } - } - @args = reverse(@args); - undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); - printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); - } - } else { - printf "\t%s",$opcode->out(); - } - } - - print $line,"\n"; -} - -while(<>) { process($_); } - -map { process($_) } @pdata_seg if ($win64); -map { process($_) } @xdata_seg if ($win64); - -# platform-specific epilogue -if ($masm) { - print "\n$current_segment\tENDS\n" if ($current_segment); - print "END\n"; -} elsif ($elf) { - # -fcf-protection segment, snatched from compiler -S output - my $align = ($flavour =~ /elf32/) ? 4 : 8; - print <<___; - -.section .note.GNU-stack,"",\@progbits -.section .note.gnu.property,"a",\@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align $align -2: -___ -} - -close STDOUT; - - ################################################# -# Cross-reference x86_64 ABI "card" -# -# Unix Win64 -# %rax * * -# %rbx - - -# %rcx #4 #1 -# %rdx #3 #2 -# %rsi #2 - -# %rdi #1 - -# %rbp - - -# %rsp - - -# %r8 #5 #3 -# %r9 #6 #4 -# %r10 * * -# %r11 * * -# %r12 - - -# %r13 - - -# %r14 - - -# %r15 - - -# -# (*) volatile register -# (-) preserved by callee -# (#) Nth argument, volatile -# -# In Unix terms top of stack is argument transfer area for arguments -# which could not be accommodated in registers. Or in other words 7th -# [integer] argument resides at 8(%rsp) upon function entry point. -# 128 bytes above %rsp constitute a "red zone" which is not touched -# by signal handlers and can be used as temporal storage without -# allocating a frame. -# -# In Win64 terms N*8 bytes on top of stack is argument transfer area, -# which belongs to/can be overwritten by callee. N is the number of -# arguments passed to callee, *but* not less than 4! This means that -# upon function entry point 5th argument resides at 40(%rsp), as well -# as that 32 bytes from 8(%rsp) can always be used as temporal -# storage [without allocating a frame]. One can actually argue that -# one can assume a "red zone" above stack pointer under Win64 as well. -# Point is that at apparently no occasion Windows kernel would alter -# the area above user stack pointer in true asynchronous manner... 
-# -# All the above means that if assembler programmer adheres to Unix -# register and stack layout, but disregards the "red zone" existence, -# it's possible to use following prologue and epilogue to "gear" from -# Unix to Win64 ABI in leaf functions with not more than 6 arguments. -# -# omnipotent_function: -# ifdef WIN64 -# movq %rdi,8(%rsp) -# movq %rsi,16(%rsp) -# movq %rcx,%rdi ; if 1st argument is actually present -# movq %rdx,%rsi ; if 2nd argument is actually ... -# movq %r8,%rdx ; if 3rd argument is ... -# movq %r9,%rcx ; if 4th argument ... -# movq 40(%rsp),%r8 ; if 5th ... -# movq 48(%rsp),%r9 ; if 6th ... -# endif -# ... -# ifdef WIN64 -# movq 8(%rsp),%rdi -# movq 16(%rsp),%rsi -# endif -# ret -# - ################################################# -# Win64 SEH, Structured Exception Handling. -# -# Unlike on Unix systems(*) lack of Win64 stack unwinding information -# has undesired side-effect at run-time: if an exception is raised in -# assembler subroutine such as those in question (basically we're -# referring to segmentation violations caused by malformed input -# parameters), the application is briskly terminated without invoking -# any exception handlers, most notably without generating memory dump -# or any user notification whatsoever. This poses a problem. It's -# possible to address it by registering custom language-specific -# handler that would restore processor context to the state at -# subroutine entry point and return "exception is not handled, keep -# unwinding" code. Writing such handler can be a challenge... But it's -# doable, though requires certain coding convention. Consider following -# snippet: -# -# .type function,@function -# function: -# movq %rsp,%rax # copy rsp to volatile register -# pushq %r15 # save non-volatile registers -# pushq %rbx -# pushq %rbp -# movq %rsp,%r11 -# subq %rdi,%r11 # prepare [variable] stack frame -# andq $-64,%r11 -# movq %rax,0(%r11) # check for exceptions -# movq %r11,%rsp # allocate [variable] stack frame -# movq %rax,0(%rsp) # save original rsp value -# magic_point: -# ... -# movq 0(%rsp),%rcx # pull original rsp value -# movq -24(%rcx),%rbp # restore non-volatile registers -# movq -16(%rcx),%rbx -# movq -8(%rcx),%r15 -# movq %rcx,%rsp # restore original rsp -# magic_epilogue: -# ret -# .size function,.-function -# -# The key is that up to magic_point copy of original rsp value remains -# in chosen volatile register and no non-volatile register, except for -# rsp, is modified. While past magic_point rsp remains constant till -# the very end of the function. In this case custom language-specific -# exception handler would look like this: -# -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -# { ULONG64 *rsp = (ULONG64 *)context->Rax; -# ULONG64 rip = context->Rip; -# -# if (rip >= magic_point) -# { rsp = (ULONG64 *)context->Rsp; -# if (rip < magic_epilogue) -# { rsp = (ULONG64 *)rsp[0]; -# context->Rbp = rsp[-3]; -# context->Rbx = rsp[-2]; -# context->R15 = rsp[-1]; -# } -# } -# context->Rsp = (ULONG64)rsp; -# context->Rdi = rsp[1]; -# context->Rsi = rsp[2]; -# -# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); -# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, -# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, -# &disp->HandlerData,&disp->EstablisherFrame,NULL); -# return ExceptionContinueSearch; -# } -# -# It's appropriate to implement this handler in assembler, directly in -# function's module. 
In order to do that one has to know members' -# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant -# values. Here they are: -# -# CONTEXT.Rax 120 -# CONTEXT.Rcx 128 -# CONTEXT.Rdx 136 -# CONTEXT.Rbx 144 -# CONTEXT.Rsp 152 -# CONTEXT.Rbp 160 -# CONTEXT.Rsi 168 -# CONTEXT.Rdi 176 -# CONTEXT.R8 184 -# CONTEXT.R9 192 -# CONTEXT.R10 200 -# CONTEXT.R11 208 -# CONTEXT.R12 216 -# CONTEXT.R13 224 -# CONTEXT.R14 232 -# CONTEXT.R15 240 -# CONTEXT.Rip 248 -# CONTEXT.Xmm6 512 -# sizeof(CONTEXT) 1232 -# DISPATCHER_CONTEXT.ControlPc 0 -# DISPATCHER_CONTEXT.ImageBase 8 -# DISPATCHER_CONTEXT.FunctionEntry 16 -# DISPATCHER_CONTEXT.EstablisherFrame 24 -# DISPATCHER_CONTEXT.TargetIp 32 -# DISPATCHER_CONTEXT.ContextRecord 40 -# DISPATCHER_CONTEXT.LanguageHandler 48 -# DISPATCHER_CONTEXT.HandlerData 56 -# UNW_FLAG_NHANDLER 0 -# ExceptionContinueSearch 1 -# -# In order to tie the handler to the function one has to compose -# couple of structures: one for .xdata segment and one for .pdata. -# -# UNWIND_INFO structure for .xdata segment would be -# -# function_unwind_info: -# .byte 9,0,0,0 -# .rva handler -# -# This structure designates exception handler for a function with -# zero-length prologue, no stack frame or frame register. -# -# To facilitate composing of .pdata structures, auto-generated "gear" -# prologue copies rsp value to rax and denotes next instruction with -# .LSEH_begin_{function_name} label. This essentially defines the SEH -# styling rule mentioned in the beginning. Position of this label is -# chosen in such manner that possible exceptions raised in the "gear" -# prologue would be accounted to caller and unwound from latter's frame. -# End of function is marked with respective .LSEH_end_{function_name} -# label. To summarize, .pdata segment would contain -# -# .rva .LSEH_begin_function -# .rva .LSEH_end_function -# .rva function_unwind_info -# -# Reference to function_unwind_info from .xdata segment is the anchor. -# In case you wonder why references are 32-bit .rvas and not 64-bit -# .quads. References put into these two segments are required to be -# *relative* to the base address of the current binary module, a.k.a. -# image base. No Win64 module, be it .exe or .dll, can be larger than -# 2GB and thus such relative references can be and are accommodated in -# 32 bits. -# -# Having reviewed the example function code, one can argue that "movq -# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix -# rax would contain an undefined value. If this "offends" you, use -# another register and refrain from modifying rax till magic_point is -# reached, i.e. as if it was a non-volatile register. If more registers -# are required prior [variable] frame setup is completed, note that -# nobody says that you can have only one "magic point." You can -# "liberate" non-volatile registers by denoting last stack off-load -# instruction and reflecting it in finer grade unwind logic in handler. -# After all, isn't it why it's called *language-specific* handler... -# -# SE handlers are also involved in unwinding stack when executable is -# profiled or debugged. Profiling implies additional limitations that -# are too subtle to discuss here. For now it's sufficient to say that -# in order to simplify handlers one should either a) offload original -# %rsp to stack (like discussed above); or b) if you have a register to -# spare for frame pointer, choose volatile one. -# -# (*) Note that we're talking about run-time, not debug-time. 
Lack of -# unwind information makes debugging hard on both Windows and -# Unix. "Unlike" refers to the fact that on Unix signal handler -# will always be invoked, core dumped and appropriate exit code -# returned to parent (for user notification). -# -######################################################################## -# As of May 2020 an alternative approach that works with both exceptions -# and debugging/profiling was implemented by re-purposing DWARF .cfi -# annotations even for Win64 unwind tables' generation. Unfortunately, -# but not really unexpectedly, it imposes additional limitations on -# coding style. Probably most significant limitation is that frame -# pointer has to be at 16*n distance from stack pointer at the exit -# from prologue. But first things first. There are two additional -# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, -# that need to be added to all functions marked with additional .type -# tag (see example below). There are "do's and don'ts" for prologue -# and epilogue. It shouldn't come as surprise that in prologue one may -# not modify non-volatile registers, but one may not modify %r11 either. -# This is because it's used as temporary frame pointer(*). There is one -# exception to this rule, and it's setting up frame pointer that is -# non-volatile or %r11. But it must be last instruction in the prologue. -# Constraints for epilogue, or rather on its boundary, depend on whether -# the frame is fixed- or variable-length. In fixed-frame subroutine -# stack pointer has to be restored in the last instruction prior the -# .cfi_epilogue directive. If it's variable-frame subroutine, and a -# non-volatile register was used as frame pointer, then last instruction -# prior the directive has to restore its original value. This means that -# final stack pointer adjustment would have to be pushed past the -# directive. Normally this would render the epilogue non-unwindable, so -# special care has to be taken. To resolve the dilemma, copy frame -# pointer to a volatile register in advance. To give an example: -# -# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! -# rbp_as_frame_pointer: -# .cfi_startproc -# push %rbp -# .cfi_push %rbp -# push %rbx -# .cfi_push %rbx -# mov %rsp,%rbp # last instruction in prologue -# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 -# .cfi_end_prologue -# sub \$40,%rsp -# and \$-64,%rsp -# ... -# mov %rbp,%r11 -# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 -# mov 0(%rbp),%rbx -# mov 8(%rbp),%rbp # last instruction prior epilogue -# .cfi_epilogue # may not change %r11 in epilogue -# lea 16(%r11),%rsp -# ret -# .cfi_endproc -# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer -# -# To give an example of fixed-frame subroutine for reference: -# -# .type fixed_frame,\@function,3,"unwind" # mind extra tag! -# fixed_frame: -# .cfi_startproc -# push %rbp -# .cfi_push %rbp -# push %rbx -# .cfi_push %rbx -# sub \$40,%rsp -# .cfi_adjust_cfa_offset 40 -# .cfi_end_prologue -# ... -# mov 40(%rsp),%rbx -# mov 48(%rsp),%rbp -# lea 56(%rsp),%rsp -# .cfi_adjust_cfa_offset -56 -# .cfi_epilogue -# ret -# .cfi_endproc -# .size fixed_frame,.-fixed_frame -# -# As for epilogue itself, one can only work on non-volatile registers. -# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. -# -# On a final note, mixing old-style and modernized subroutines in the -# same file takes some trickery. Ones of the new kind have to appear -# after old-style ones. 
This has everything to do with the fact that -# entries in the .pdata segment have to appear in strictly same order -# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION -# structures get mechanically appended to whatever existing .pdata. -# -# (*) Just in case, why %r11 and not %rax. This has everything to do -# with the way UNWIND_INFO is, one just can't designate %rax as -# frame pointer. diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c index 0fcf563f502..df11e3dae73 100644 --- a/crypto/blst_src/client_min_pk.c +++ b/crypto/blst_src/client_min_pk.c @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "keygen.c" +/*#include "keygen.c" #include "e2.c" #include "hash_to_field.c" #include "map_to_g2.c" @@ -14,4 +14,4 @@ #include "recip.c" #include "consts.c" #include "vect.c" -#include "exports.c" +#include "exports.c"*/ diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c index 8e4663daede..fffbd5ad52d 100644 --- a/crypto/blst_src/client_min_sig.c +++ b/crypto/blst_src/client_min_sig.c @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "keygen.c" +/*#include "keygen.c" #include "e1.c" #include "hash_to_field.c" #include "map_to_g1.c" @@ -14,4 +14,4 @@ #include "recip.c" #include "consts.c" #include "vect.c" -#include "exports.c" +#include "exports.c"*/ diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 451c1b8a180..175fe5acb0a 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #include "dkg_include.h" import "C" diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 335ce6fc86d..ecb26f7d6e3 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #include "dkg_include.h" import "C" diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 7b63f88e810..d79379f7d83 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -3,7 +3,7 @@ package crypto -// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" import "C" diff --git a/crypto/spock.go b/crypto/spock.go index 2487f39ce1b..a4087316319 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -6,7 +6,7 @@ package crypto // SPoCK design based on the BLS signature scheme. // BLS is using BLS12-381 curve and the same settings in bls.go. 
-// #cgo CFLAGS: -g -Wall -std=c99 +// #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" From 3c0247cfc9c72c0c9e75231b9769fb0e84fff26d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 01:06:20 -0600 Subject: [PATCH 007/200] include blst.h --- crypto/blst_include.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 586f6069590..4ac79c7723b 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -4,4 +4,6 @@ // blst related definitions // eventually this file would replace blst.h +#include "blst.h" + #endif \ No newline at end of file From 79601b66a999ef80067af66063085a929fde7f65 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 01:09:09 -0600 Subject: [PATCH 008/200] tidy go.mod after removing blst package --- crypto/go.mod | 2 -- crypto/go.sum | 4 ---- 2 files changed, 6 deletions(-) diff --git a/crypto/go.mod b/crypto/go.mod index c7fe54f9ff5..57c20ef9341 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -6,10 +6,8 @@ require ( github.com/btcsuite/btcd/btcec/v2 v2.2.1 github.com/sirupsen/logrus v1.4.2 github.com/stretchr/testify v1.8.0 - github.com/supranational/blst v0.3.10 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 - pgregory.net/rapid v0.4.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 19a05d05d6d..181f9b302c0 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -28,8 +28,6 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= -github.com/supranational/blst v0.3.10/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d h1:sK3txAijHtOK88l68nt020reeT1ZdKLIYetKl95FzVY= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -54,6 +52,4 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= -pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From e64cc36a82377a77b594db460799b892de1f8cab Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Feb 2023 01:14:40 -0600 Subject: [PATCH 009/200] add missing relic flags --- crypto/blst_include.h | 2 ++ crypto/blst_src.c | 10 +--------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 4ac79c7723b..dde3acd5f05 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -1,3 +1,5 @@ +// +build relic + #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ diff --git a/crypto/blst_src.c b/crypto/blst_src.c index 
c124bcec078..89388b703fe 100644 --- a/crypto/blst_src.c +++ b/crypto/blst_src.c @@ -1,8 +1,4 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ +// +build relic #include "keygen.c" #include "hash_to_field.c" @@ -21,7 +17,3 @@ #include "consts.c" #include "vect.c" #include "exports.c" -#include "rb_tree.c" -#ifdef BLST_FR_PENTAROOT -# include "pentaroot.c" -#endif From e51d94e1744b6d0c92a76b4db77c2ac8d093df24 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 17 Feb 2023 00:43:00 -0600 Subject: [PATCH 010/200] first iteration of scalar type change --- crypto/bls.go | 58 ++++++-------- crypto/bls12381_utils.c | 121 +++++++++++++++++------------ crypto/bls12381_utils.go | 65 ++++++++-------- crypto/bls12381_utils.h | 39 ++++++---- crypto/bls12381_utils_test.go | 19 +++-- crypto/bls_core.c | 7 +- crypto/bls_include.h | 2 +- crypto/bls_multisig.go | 43 +++++----- crypto/bls_test.go | 18 +++-- crypto/bls_thresholdsign.go | 24 +++--- crypto/bls_thresholdsign_core.c | 2 +- crypto/bls_thresholdsign_include.h | 2 +- crypto/bls_thresholdsign_test.go | 2 + crypto/blst_include.h | 11 ++- crypto/dkg.go | 5 +- crypto/dkg_core.c | 6 +- crypto/dkg_feldmanvss.go | 26 ++++--- crypto/dkg_feldmanvssq.go | 17 ++-- crypto/dkg_include.h | 4 +- crypto/dkg_jointfeldman.go | 11 +-- crypto/dkg_test.go | 3 +- crypto/thresholdsign.go | 8 +- 22 files changed, 269 insertions(+), 224 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 6786f00c4d5..f062fa50f5a 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -166,12 +166,9 @@ func (sk *prKeyBLSBLS12381) Sign(data []byte, kmac hash.Hasher) (Signature, erro // hash the input to 128 bytes h := kmac.ComputeHash(data) - // set BLS context - blsInstance.reInit() - s := make([]byte, SignatureLenBLSBLS12381) C.bls_sign((*C.uchar)(&s[0]), - (*C.bn_st)(&sk.scalar), + (*C.Fr)(&sk.scalar), (*C.uchar)(&h[0]), (C.int)(len(h))) return s, nil @@ -203,9 +200,6 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, err } - // intialize BLS context - blsInstance.reInit() - if len(s) != signatureLengthBLSBLS12381 { return false, nil } @@ -292,18 +286,24 @@ func BLSInvalidSignature() Signature { // decodePrivateKey decodes a slice of bytes into a private key. // It checks the scalar is non-zero and is less than the group order. func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { - if len(privateKeyBytes) != prKeyLengthBLSBLS12381 { - return nil, invalidInputsErrorf("input length must be %d, got %d", - prKeyLengthBLSBLS12381, len(privateKeyBytes)) - } sk := newPrKeyBLSBLS12381(nil) - readScalar(&sk.scalar, privateKeyBytes) - if C.check_membership_Zr_star((*C.bn_st)(&sk.scalar)) == valid { + read := C.Fr_star_read_bytes( + (*C.Fr)(&sk.scalar), + (*C.uchar)(&privateKeyBytes[0]), + (C.int)(prKeyLengthBLSBLS12381)) + + switch int(read) { + case blst_valid: return sk, nil + case blst_bad_encoding: + return nil, invalidInputsErrorf("input length must be %d, got %d", + prKeyLengthBLSBLS12381, len(privateKeyBytes)) + case blst_bad_scalar: + return nil, invalidInputsErrorf("the private key is not in the correct range for the BLS12-381 curve") + default: + return nil, invalidInputsErrorf("reading the private key failed") } - - return nil, invalidInputsErrorf("the private key is not a valid BLS12-381 curve key") } // decodePublicKey decodes a slice of bytes into a public key. 
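The switch above maps the C-layer return codes of Fr_star_read_bytes onto the library's error types. For reference, a minimal C-side usage sketch of the same validation path, assuming sk_bytes is a hypothetical 32-byte big-endian candidate key buffer:

    Fr sk;
    BLST_ERROR ret = Fr_star_read_bytes(&sk, sk_bytes, Fr_BYTES);
    if (ret == BLST_BAD_ENCODING) {
        // input length was not Fr_BYTES (32)
    } else if (ret == BLST_BAD_SCALAR) {
        // scalar is zero or not strictly less than the group order r
    } else {
        // BLST_SUCCESS: sk holds a valid non-zero element of Fr
    }
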
@@ -356,16 +356,13 @@ type prKeyBLSBLS12381 struct { // If no scalar is provided, the function allocates an // empty scalar. func newPrKeyBLSBLS12381(x *scalar) *prKeyBLSBLS12381 { - var sk prKeyBLSBLS12381 - if x == nil { - // initialize the scalar - C.bn_new_wrapper((*C.bn_st)(&sk.scalar)) - } else { - // set the scalar - sk.scalar = *x + if x != nil { + return &prKeyBLSBLS12381{ + // the embedded public key is only computed when needed + scalar: *x, + } } - // the embedded public key is only computed when needed - return &sk + return &prKeyBLSBLS12381{} } // Algorithm returns the Signing Algorithm @@ -415,7 +412,7 @@ func (sk *prKeyBLSBLS12381) Equals(other PrivateKey) bool { if !ok { return false } - return sk.scalar.equals(&otherBLS.scalar) + return (&sk.scalar).equals(&otherBLS.scalar) } // String returns the hex string representation of the key. @@ -520,15 +517,6 @@ func (a *blsBLS12381Algo) init() error { return nil } -// set the context of BLS 12-381 curve in the lower C and Relic layers assuming the context -// was previously initialized with a call to init(). -// -// If the implementation evolves to support multiple contexts, -// reinit should be called at every blsBLS12381Algo operation. -func (a *blsBLS12381Algo) reInit() { - a.context.setContext() -} - // This is only a TEST/DEBUG/BENCH function. // It returns the hash to G1 point from a slice of 128 bytes func mapToG1(data []byte) *pointG1 { @@ -556,7 +544,7 @@ func (sk *prKeyBLSBLS12381) signWithXMDSHA256(data []byte) Signature { // sign the hash s := make([]byte, SignatureLenBLSBLS12381) C.bls_sign((*C.uchar)(&s[0]), - (*C.bn_st)(&sk.scalar), + (*C.Fr)(&sk.scalar), (*C.uchar)(&hash[0]), (C.int)(len(hash))) return s diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index f8af1b0f073..fea72d33075 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,16 @@ #include "bls_include.h" #include "assert.h" +// TODO: temp utility function to delete +bn_st* Fr_blst_to_relic(const Fr* x) { + bn_st* out = (bn_st*)malloc(sizeof(bn_st)); + byte* data = (byte*)malloc(Fr_BYTES); + be_bytes_from_limbs(data, (limb_t*)x, Fr_BYTES); + bn_read_bin(out, data, Fr_BYTES); + free(data); + return out; +} + // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) // return macro values to the upper Go Layer @@ -120,33 +130,37 @@ void seed_relic(byte* seed, int len) { } // Exponentiation of a generic point p in G1 -void ep_mult(ep_t res, const ep_t p, const bn_t expo) { +void ep_mult(ep_t res, const ep_t p, const Fr *expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 - ep_mul_lwnaf(res, p, expo); + ep_mul_lwnaf(res, p, tmp_expo); } // Exponentiation of generator g1 in G1 // These two function are here for bench purposes only -void ep_mult_gen_bench(ep_t res, const bn_t expo) { +void ep_mult_gen_bench(ep_t res, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 - ep_mul_gen(res, (bn_st *)expo); + ep_mul_gen(res, tmp_expo); } -void ep_mult_generic_bench(ep_t res, const bn_t expo) { +void ep_mult_generic_bench(ep_t res, const Fr* expo) { // generic point multiplication ep_mult(res, &core_get()->ep_g, expo); } // Exponentiation of a generic point p in G2 -void ep2_mult(ep2_t res, ep2_t p, bn_t expo) { +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 - ep2_mul_lwnaf(res, p, expo); + ep2_mul_lwnaf(res, p, tmp_expo); } -// Exponentiation of fixed g2 in 
G2 -void ep2_mult_gen(ep2_t res, const bn_t expo) { +// Exponentiation of generator g2 in G2 +void ep2_mult_gen(ep2_t res, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 - g2_mul_gen(res, (bn_st*)expo); + g2_mul_gen(res, (bn_st*)tmp_expo); } // DEBUG printing functions @@ -183,7 +197,7 @@ void ep2_print_(char* s, ep2_st* p) { } // generates a random number less than the order r -void bn_randZr_star(bn_t x) { +void bn_randZr_star(Fr* x) { // reduce the modular reduction bias const int seed_len = BITS_TO_BYTES(Fr_BITS + SEC_BITS); byte seed[seed_len]; @@ -192,33 +206,16 @@ void bn_randZr_star(bn_t x) { } // generates a random number less than the order r -void bn_randZr(bn_t x) { - bn_t r; - bn_new(r); - g2_get_ord(r); - // reduce the modular reduction bias - bn_new_size(x, BITS_TO_DIGITS(Fr_BITS + SEC_BITS)); - bn_rand(x, RLC_POS, Fr_BITS + SEC_BITS); - bn_mod(x, x, r); - bn_free(r); +void bn_randZr(Fr* x) { + // TODO: SEC_BITS bias reduction } -// reads a scalar from an array and maps it to Zr +// reads a scalar from an array and maps it to Fr // the resulting scalar is in the range 0 < a < r // len must be less than BITS_TO_BYTES(RLC_BN_BITS) -void bn_map_to_Zr_star(bn_t a, const uint8_t* bin, int len) { - bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_t r; - bn_new(r); - g2_get_ord(r); - bn_sub_dig(r,r,1); - bn_mod_basic(a,tmp,r); - bn_add_dig(a,a,1); - bn_free(r); - bn_free(tmp); +void bn_map_to_Zr_star(Fr* a, const uint8_t* bin, int len) { + // TODO: + // a = bin % (r-1) + 1 } // returns the sign of y. @@ -523,26 +520,50 @@ int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { return RLC_ERR; } -// reads a scalar in a and checks it is a valid Zr element (a < r) -// returns RLC_OK if the scalar is valid and RLC_ERR otherwise. -int bn_read_Zr_bin(bn_t a, const uint8_t *bin, int len) { - if (len!=Fr_BYTES) { - return RLC_ERR; +bool_t Fr_is_zero(const Fr* a) { + return bytes_are_zero((const byte*)a, Fr_BYTES); +} + +bool_t Fr_is_equal(const Fr* a, const Fr* b) { + return vec_is_equal(a, b, Fr_BYTES); +} + +// reads a scalar in `a` and checks it is a valid Fr element (a < r). +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fr +// - v if the scalar is valid +BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { + if (len != Fr_BYTES) { + return BLST_BAD_ENCODING; } - bn_read_bin(a, bin, Fr_BYTES); - bn_t r; - bn_new(r); - g2_get_ord(r); - if (bn_cmp(a, r) == RLC_LT) { - return RLC_OK; + if (!check_mod_256(bin, BLS12_381_r)) { // check_mod_256 compares byte[] against a vec256! + return BLST_BAD_SCALAR; } - return RLC_ERR; + limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); + return BLST_SUCCESS; +} + +// reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). 
+// returns +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fr_star +// - BLST_SUCCESS if the scalar is valid +BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { + int ret = Fr_read_bytes(a, bin, len); + if (ret != BLST_SUCCESS) { + return ret; + } + // check if a=0 + if (Fr_is_zero(a)) { + return BLST_BAD_SCALAR; + } + return BLST_SUCCESS; } // computes the sum of the array elements x and writes the sum in jointx -// the sum is computed in Zr -void bn_sum_vector(bn_t jointx, const bn_st* x, const int len) { - bn_t r; +// the sum is computed in Fr +void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) { + /*bn_t r; bn_new(r); g2_get_ord(r); bn_set_dig(jointx, 0); @@ -552,7 +573,7 @@ void bn_sum_vector(bn_t jointx, const bn_st* x, const int len) { if (bn_cmp(jointx, r) == RLC_GT) bn_sub(jointx, jointx, r); } - bn_free(r); + bn_free(r);*/ } // computes the sum of the G2 array elements y and writes the sum in jointy diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index fa931cffab6..8c6f1277842 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -17,11 +17,16 @@ import ( "errors" ) +// Go wrappers around BLST C types // Go wrappers around Relic C types -// Relic is compiled with ALLOC=AUTO type pointG1 C.ep_st type pointG2 C.ep2_st -type scalar C.bn_st +type scalar C.Fr + +// TODO: For now scalars are represented as field elements Fr since all scalars +// are less than r - check if distinguishing two types in necessary +//type pointG1_blst C.G1 +//type pointG2_blst C.G2 // context required for the BLS set-up type ctx struct { @@ -34,6 +39,12 @@ type ctx struct { var valid = C.get_valid() var invalid = C.get_invalid() +// get some constants from the C layer +// var blst_errors = C.blst_get_errors() +var blst_valid = (int)(C.BLST_SUCCESS) //int(blst_errors[0]) +var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) // int(blst_errors[0]) +var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) // int(blst_errors[0]) + // initContext sets relic B12_381 parameters and precomputes some data in the C layer func (ct *ctx) initContext() error { c := C.relic_init_BLS12_381() @@ -62,39 +73,32 @@ func seedRelic(seed []byte) error { return nil } -// setContext sets the context (previously initialized) of the C layer with -// pre-saved data. 
-func (ct *ctx) setContext() { - C.core_set(ct.relicCtx) - C.precomputed_data_set(ct.precCtx) -} - // Exponentiation in G1 (scalar point multiplication) func (p *pointG1) scalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.bn_st)(expo)) + C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) } // This function is for TEST only // Exponentiation of g1 in G1 func generatorScalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult_gen_bench((*C.ep_st)(res), (*C.bn_st)(expo)) + C.ep_mult_gen_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // This function is for TEST only // Generic Exponentiation G1 func genericScalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult_generic_bench((*C.ep_st)(res), (*C.bn_st)(expo)) + C.ep_mult_generic_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // Exponentiation of g2 in G2 func generatorScalarMultG2(res *pointG2, expo *scalar) { - C.ep2_mult_gen((*C.ep2_st)(res), (*C.bn_st)(expo)) + C.ep2_mult_gen((*C.ep2_st)(res), (*C.Fr)(expo)) } -// comparison in Zr where r is the group order of G1/G2 +// comparison in Fr where r is the group order of G1/G2 // (both scalars should be reduced mod r) func (x *scalar) equals(other *scalar) bool { - return C.bn_cmp((*C.bn_st)(x), (*C.bn_st)(other)) == valid + return C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other)) != 0 } // comparison in G2 @@ -102,10 +106,10 @@ func (p *pointG2) equals(other *pointG2) bool { return C.ep2_cmp((*C.ep2_st)(p), (*C.ep2_st)(other)) == valid } -// Comparison to zero in Zr. +// Comparison to zero in Fr. // Scalar must be already reduced modulo r func (x *scalar) isZero() bool { - return C.bn_is_zero((*C.bn_st)(x)) == 1 + return C.Fr_is_zero((*C.Fr)(x)) != 0 } // Comparison to point at infinity in G2. @@ -113,17 +117,17 @@ func (p *pointG2) isInfinity() bool { return C.ep2_is_infty((*C.ep2_st)(p)) == 1 } -// returns a random number in Zr +// returns a random number in Fr func randZr(x *scalar) { - C.bn_randZr((*C.bn_st)(x)) + //C.bn_randZr((*C.Fr)(x)) } -// returns a random non-zero number in Zr +// returns a random non-zero number in Fr func randZrStar(x *scalar) { - C.bn_randZr_star((*C.bn_st)(x)) + //C.bn_randZr_star((*C.Fr)(x)) } -// mapToZrStar reads a scalar from a slice of bytes and maps it to Zr +// mapToZrStar reads a scalar from a slice of bytes and maps it to Fr // the resulting scalar is in the range 0 < k < r func mapToZrStar(x *scalar, src []byte) error { if len(src) > maxScalarSize { @@ -131,7 +135,7 @@ func mapToZrStar(x *scalar, src []byte) error { "input slice length must be less than %d", maxScalarSize) } - C.bn_map_to_Zr_star((*C.bn_st)(x), + C.bn_map_to_Zr_star((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) return nil @@ -139,18 +143,11 @@ func mapToZrStar(x *scalar, src []byte) error { // writeScalar writes a G2 point in a slice of bytes func writeScalar(dest []byte, x *scalar) { - C.bn_write_bin((*C.uchar)(&dest[0]), + /*C.bn_write_bin((*C.uchar)(&dest[0]), (C.int)(prKeyLengthBLSBLS12381), - (*C.bn_st)(x), - ) -} - -// readScalar reads a scalar from a slice of bytes -func readScalar(x *scalar, src []byte) { - C.bn_read_bin((*C.bn_st)(x), - (*C.uchar)(&src[0]), - (C.int)(len(src)), - ) + (*C.Fr)(x), + )*/ + // TODO: to fill } // writePointG2 writes a G2 point in a slice of bytes diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index d6978d6188d..06bb81332fe 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -17,16 +17,18 @@ #define BITS_TO_BYTES(x) ((x+7)>>3) #define BITS_TO_DIGITS(x) ((x+63)>>6) #define BYTES_TO_DIGITS(x) 
((x+7)>>3) +#define DIGITS_TO_BYTES(x) ((x)<<3) #define MIN(a,b) ((a)>(b)?(b):(a)) // Fields and Group serialization lengths #define SEC_BITS 128 #define Fp_BITS 381 -#define Fr_BITS 255 -#define Fp_BYTES BITS_TO_BYTES(Fp_BITS) #define Fp2_BYTES (2*Fp_BYTES) #define Fp_DIGITS BITS_TO_DIGITS(Fp_BITS) -#define Fr_BYTES BITS_TO_BYTES(Fr_BITS) +#define Fp_BYTES DIGITS_TO_BYTES(Fp_DIGITS) // BLST implements Fp as a limb array +#define Fr_BITS 255 +#define Fr_DIGITS BITS_TO_DIGITS(Fr_BITS) +#define Fr_BYTES DIGITS_TO_BYTES(Fr_DIGITS) // BLST implements Fr as a limb array #define G1_BYTES (2*Fp_BYTES) #define G2_BYTES (2*Fp2_BYTES) @@ -76,12 +78,19 @@ typedef struct prec_ { fp_t r; // Montgomery multiplication constant } prec_st; +// TODO: to delete when Relic is removed +bn_st* Fr_blst_to_relic(const Fr* x); + // BLS based SPoCK int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) void map_to_G1(ep_t, const byte*, const int); +// Fr utilities +bool_t Fr_is_zero(const Fr* a); +bool_t Fr_is_equal(const Fr* a, const Fr* b); + // Utility functions int get_valid(); int get_invalid(); @@ -96,18 +105,22 @@ int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); int ep2_read_bin_compact(ep2_t, const byte *, const int); void ep2_write_bin_compact(byte *, const ep2_t, const int); -int bn_read_Zr_bin(bn_t, const uint8_t *, int ); +BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); +BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); + + -void ep_mult_gen_bench(ep_t, const bn_t); -void ep_mult_generic_bench(ep_t, const bn_t); -void ep_mult(ep_t, const ep_t, const bn_t); -void ep2_mult_gen(ep2_t, const bn_t); +void ep_mult_gen_bench(ep_t, const Fr*); +void ep_mult_generic_bench(ep_t, const Fr*); +void ep_mult(ep_t, const ep_t, const Fr*); +void ep2_mult_gen(ep2_t, const Fr*); +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); -void bn_randZr(bn_t); -void bn_randZr_star(bn_t); -void bn_map_to_Zr_star(bn_t, const uint8_t*, int); +void bn_randZr(Fr*); +void bn_randZr_star(Fr*); +void bn_map_to_Zr_star(Fr*, const uint8_t*, int); -void bn_sum_vector(bn_t, const bn_st*, const int); +void Fr_sum_vector(Fr*, const Fr*, const int); void ep_sum_vector(ep_t, ep_st*, const int); void ep2_sum_vector(ep2_t, ep2_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); @@ -116,7 +129,7 @@ void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len); // membership checks int check_membership_G1(const ep_t); int check_membership_G2(const ep2_t); -int check_membership_Zr_star(const bn_t); +int check_membership_Fr_star(const bn_t); int simple_subgroup_check_G1(const ep_t); int simple_subgroup_check_G2(const ep2_t); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 8911ada1769..ce8f6d9df09 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -26,8 +26,8 @@ func TestDeterministicKeyGen(t *testing.T) { } // test the deterministicity of the relic PRG (used by the DKG polynomials) -func TestPRGseeding(t *testing.T) { - blsInstance.reInit() +/*func TestPRGseeding(t *testing.T) { + // 2 scalars generated with the same seed should be equal seed := make([]byte, KeyGenSeedMinLenBLSBLS12381) n, err := rand.Read(seed) @@ -37,24 +37,24 @@ func TestPRGseeding(t *testing.T) { err = seedRelic(seed) require.Nil(t, err) var sk1 prKeyBLSBLS12381 - randZr(&sk1.scalar) + randZr(sk1.scalar) 
// 2nd scalar (wrapped in a private key) err = seedRelic(seed) require.Nil(t, err) var sk2 prKeyBLSBLS12381 - randZr(&sk2.scalar) + randZr(sk2.scalar) // compare the 2 scalars (by comparing the private keys) assert.True(t, sk1.Equals(&sk2), "private keys should be equal") -} +}*/ // G1 and G2 scalar multiplication func BenchmarkScalarMultG1G2(b *testing.B) { - blsInstance.reInit() + seed := make([]byte, securityBits/8) _, _ = rand.Read(seed) _ = seedRelic(seed) var expo scalar - randZr(&expo) + randZr(&expo) // TODO: upadate // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { @@ -122,7 +122,7 @@ func TestMapToG1(t *testing.T) { // Hashing to G1 bench func BenchmarkMapToG1(b *testing.B) { - blsInstance.reInit() + input := make([]byte, expandMsgOutput) for i := 0; i < len(input); i++ { input[i] = byte(i) @@ -136,7 +136,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { - blsInstance.reInit() + // seed Relic PRG seed := make([]byte, securityBits/8) _, _ = rand.Read(seed) @@ -165,7 +165,6 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { - blsInstance.reInit() b.Run("G1", func(b *testing.B) { var p pointG1 diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 7cb8a04aef6..cdfc6aaf7f1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,7 +21,7 @@ int get_sk_len() { // checks an input scalar a satisfies 0 < a < r // where (r) is the order of G1/G2 -int check_membership_Zr_star(const bn_t a){ +int check_membership_Fr_star(const bn_t a){ if (bn_cmp(a, &core_get()->ep_r) != RLC_LT || bn_cmp_dig(a, 0) != RLC_GT) { return INVALID; } @@ -68,9 +68,10 @@ int check_membership_G2(const ep2_t p){ } // Computes a BLS signature from a G1 point -static void bls_sign_ep(byte* s, const bn_t sk, const ep_t h) { +static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { ep_t p; ep_new(p); + // s = h^sk ep_mult(p, h, sk); ep_write_bin_compact(s, p, SIGNATURE_LEN); @@ -78,7 +79,7 @@ static void bls_sign_ep(byte* s, const bn_t sk, const ep_t h) { } // Computes a BLS signature from a hash -void bls_sign(byte* s, const bn_t sk, const byte* data, const int len) { +void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { ep_t h; ep_new(h); // hash to G1 diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 016845719e1..325203479b2 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -35,7 +35,7 @@ int get_signature_len(); int get_pk_len(); int get_sk_len(); -void bls_sign(byte*, const bn_t, const byte*, const int); +void bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const ep2_t, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, const uint32_t*, const ep2_st*); diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index a915bed4a64..297e61267d9 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -5,9 +5,12 @@ package crypto import ( "errors" - "fmt" - "github.com/onflow/flow-go/crypto/hash" + _ "errors" + + _ "fmt" + + _ "github.com/onflow/flow-go/crypto/hash" ) // BLS multi-signature using BLS12-381 curve @@ -38,6 +41,7 @@ import "C" // used for signatures. var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) +/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. 
// // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -92,8 +96,8 @@ func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { // - (nil, error) if an unexpected error occurs // - (aggregated_signature, nil) otherwise func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - // set BLS context - blsInstance.reInit() + + // check for empty list if len(sigs) == 0 { @@ -139,8 +143,8 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { // - (nil, blsAggregateEmptyListError) if no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - // set BLS context - blsInstance.reInit() + + // check for empty list if len(keys) == 0 { @@ -157,8 +161,7 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { } var sum scalar - C.bn_new_wrapper((*C.bn_st)(&sum)) - C.bn_sum_vector((*C.bn_st)(&sum), (*C.bn_st)(&scalars[0]), + C.Fr_sum_vector((*C.Fr)(&sum), (*C.Fr)(&scalars[0]), (C.int)(len(scalars))) return newPrKeyBLSBLS12381(&sum), nil } @@ -177,8 +180,8 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { // - (nil, blsAggregateEmptyListError) no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - // set BLS context - blsInstance.reInit() + + // check for empty list if len(keys) == 0 { @@ -200,13 +203,12 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil -} +}*/ // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). +// TODO: return a constant key instead of a newly allocated one func IdentityBLSPublicKey() PublicKey { - // set BLS context - blsInstance.reInit() identity := *newPubKeyBLSBLS12381(nil) // set the point to infinity @@ -215,6 +217,8 @@ func IdentityBLSPublicKey() PublicKey { return &identity } +/* + // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -230,8 +234,8 @@ func IdentityBLSPublicKey() PublicKey { // - (nil, notBLSKeyError) if at least one input key is not of type BLS BLS12-381 // - (remaining_key, nil) otherwise func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - // set BLS context - blsInstance.reInit() + + aggPKBLS, ok := aggKey.(*pubKeyBLSBLS12381) if !ok { @@ -330,8 +334,8 @@ func VerifyBLSSignatureOneMessage( func VerifyBLSSignatureManyMessages( pks []PublicKey, s Signature, messages [][]byte, kmac []hash.Hasher, ) (bool, error) { - // set BLS context - blsInstance.reInit() + + // check signature length if len(s) != signatureLengthBLSBLS12381 { @@ -479,8 +483,8 @@ func VerifyBLSSignatureManyMessages( func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { - // set BLS context - blsInstance.reInit() + + // empty list check if len(pks) == 0 { @@ -545,6 +549,7 @@ func BatchVerifyBLSSignaturesOneMessage( return verifBool, nil } +*/ // blsAggregateEmptyListError is returned when a list of BLS objects (e.g. signatures or keys) // is empty or nil and thereby represents an invalid input. 
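AggregateBLSPrivateKeys above now delegates the scalar sum to Fr_sum_vector, whose Relic-based body is commented out in bls12381_utils.c. A minimal sketch of that sum using BLST's constant-time modular addition, assuming add_mod_256 and the BLS12_381_r constant from the vendored blst_src, and that every input limb vector is already reduced mod r:

    void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) {
        vec_zero((limb_t*)jointx, Fr_BYTES);       // jointx = 0
        for (int i = 0; i < len; i++) {
            // jointx = (jointx + x[i]) mod r
            add_mod_256((limb_t*)jointx, (const limb_t*)jointx, (const limb_t*)&x[i], BLS12_381_r);
        }
    }
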
diff --git a/crypto/bls_test.go b/crypto/bls_test.go index d0dc73c066c..df0afe1e96d 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -7,9 +7,9 @@ import ( "crypto/rand" "encoding/hex" "fmt" - mrand "math/rand" + _ "math/rand" "testing" - "time" + _ "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -131,7 +131,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) }) - t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { + /*t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher sigKmac := NewExpandMsgXOFKMAC128("") @@ -140,7 +140,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { // PoP hasher h2 := popKMAC.ComputeHash(data) assert.NotEqual(t, h1, h2) - }) + })*/ } @@ -216,7 +216,7 @@ func TestBLSUtils(t *testing.T) { } // BLS Proof of Possession test -func TestBLSPOP(t *testing.T) { +/*func TestBLSPOP(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -268,6 +268,8 @@ func TestBLSPOP(t *testing.T) { }) } + + // BLS multi-signature // signature aggregation sanity check // @@ -934,7 +936,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { assert.False(t, valid, "verification should fail with nil hasher") inputPks[0] = tmpPK }) -} +}*/ // TestBLSErrorTypes verifies working of error-type-detecting functions // such as `IsInvalidInputsError`. @@ -962,6 +964,7 @@ func TestBLSErrorTypes(t *testing.T) { }) } +/* // VerifyBLSSignatureManyMessages bench // Bench the slowest case where all messages and public keys are distinct. // (2*n) pairings without aggrgetion Vs (n+1) pairings with aggregation. @@ -1057,6 +1060,7 @@ func BenchmarkAggregate(b *testing.B) { }) } + func TestBLSIdentity(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) @@ -1109,4 +1113,4 @@ func TestBLSIdentity(t *testing.T) { assert.NoError(t, err) assert.False(t, valid) }) -} +}*/ diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 4aa73278d3a..df3da1a108d 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -5,7 +5,7 @@ package crypto // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" -import "C" +/*import "C" import ( "fmt" @@ -412,8 +412,8 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat signers = append(signers, index) } - // set BLS settings - blsInstance.reInit() + + // Lagrange Interpolate at point 0 result := C.G1_lagrangeInterpolateAtZero( @@ -456,8 +456,8 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat // are considered to reconstruct the signature. 
func BLSReconstructThresholdSignature(size int, threshold int, shares []Signature, signers []int) (Signature, error) { - // set BLS settings - blsInstance.reInit() + + if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, invalidInputsErrorf( @@ -558,8 +558,8 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, threshold) } - // set BLS settings - blsInstance.reInit() + + // the scalars x and G2 points y x := make([]scalar, size) @@ -570,7 +570,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, if err := seedRelic(seed); err != nil { return nil, nil, nil, fmt.Errorf("seeding relic failed: %w", err) } - // Generate a polynomial P in Zr[X] of degree t + // Generate a polynomial P in Fr[X] of degree t a := make([]scalar, threshold+1) randZrStar(&a[0]) // non-identity key if threshold > 0 { @@ -581,10 +581,10 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, } // compute the shares for i := index(1); int(i) <= size; i++ { - C.Zr_polynomialImage( - (*C.bn_st)(&x[i-1]), + C.Fr_polynomialImage( + (*C.Fr)(&x[i-1]), (*C.ep2_st)(&y[i-1]), - (*C.bn_st)(&a[0]), (C.int)(len(a)), + (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(i), ) } @@ -604,4 +604,4 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // are sampled uniformly at random. The probability of // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil -} +}*/ diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index dc57355df47..94a12a024d7 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -9,7 +9,7 @@ static void Zr_lagrangeCoefficientAtZero(bn_t res, const int i, const uint8_t* s bn_t r, r_2; bn_new(r); g2_get_ord(r); - // (r-2) is needed to compute the inverse in Zr + // (r-2) is needed to compute the inverse in Fr // using little Fermat theorem bn_new(r_2); bn_sub_dig(r_2, r, 2); diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 7471e1a0a3d..9b3a700fc96 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -10,6 +10,6 @@ #define MAX_IND_LOOPS 32 int G1_lagrangeInterpolateAtZero(byte*, const byte* , const uint8_t*, const int); -extern void Zr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); +extern void Fr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); #endif diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index cc9be81eeaf..7f0802d57b9 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -3,6 +3,7 @@ package crypto +/* import ( "crypto/rand" "fmt" @@ -615,3 +616,4 @@ func BenchmarkSimpleKeyGen(b *testing.B) { } b.StopTimer() } +*/ diff --git a/crypto/blst_include.h b/crypto/blst_include.h index dde3acd5f05..2721edcd97a 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,6 +6,15 @@ // blst related definitions // eventually this file would replace blst.h -#include "blst.h" +#include "blst.h" // TODO: should be deleted +#include "point.h" +#include "consts.h" + +// field elements F_r +typedef struct {limb_t limbs[4];} Fr; // also used as vec256; +// Subroup G1 in E1 +typedef POINTonE1 G1; +// Subroup G1 in E2 +typedef POINTonE2 G2; #endif \ No newline at end of file diff --git a/crypto/dkg.go b/crypto/dkg.go index 6e74f3d54a5..3e369b77fa4 100644 --- a/crypto/dkg.go +++ 
b/crypto/dkg.go @@ -1,5 +1,6 @@ package crypto +/* import ( "errors" "fmt" @@ -22,7 +23,7 @@ import ( // Flow uses DKG with the value t = floor((n-1)/2) to optimize for unforgeability and robustness // of the threshold signature scheme using the output keys. // -// Private keys are scalar in Zr, where r is the group order of G1/G2. +// Private keys are scalar in Fr, where r is the group order of G1/G2. // Public keys are in G2. const ( @@ -234,4 +235,4 @@ type DKGProcessor interface { // do so, the protocol can be broken. // log describes the misbehavior. FlagMisbehavior(participant int, log string) -} +}*/ diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 3a2bce01559..50923ee9087 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -11,10 +11,10 @@ // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ +void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ bn_t image; bn_new(image); - Zr_polynomialImage(image, y, a, a_size, x); + Fr_polynomialImage(image, y, a, a_size, x); // exports the result const int out_size = Fr_BYTES; bn_write_bin(out, out_size, image); @@ -25,7 +25,7 @@ void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_s // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Zr_polynomialImage(bn_t image, ep2_t y, const bn_st *a, const int a_size, const byte x){ +void Fr_polynomialImage(bn_t image, ep2_t y, const bn_st *a, const int a_size, const byte x){ bn_t r; bn_new(r); g2_get_ord(r); diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 175fe5acb0a..76a5aebcd49 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #include "dkg_include.h" import "C" @@ -21,7 +22,7 @@ import ( // partcipants including itself. The particpants validate their shares // using a public verifiaction vector shared by the . -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. // feldman VSS protocol, implements DKGState @@ -30,7 +31,7 @@ type feldmanVSSstate struct { *dkgCommon // participant index dealerIndex index - // Polynomial P = a_0 + a_1*x + .. + a_t*x^t in Zr[X], the vector size is (t+1) + // Polynomial P = a_0 + a_1*x + .. + a_t*x^t in Fr[X], the vector size is (t+1) // a_0 is the group private key a []scalar // Public vector of the group, the vector size is (t+1) @@ -77,12 +78,12 @@ func NewFeldmanVSS(size int, threshold int, myIndex int, func (s *feldmanVSSstate) init() { // set the bls context - blsInstance.reInit() + s.running = false s.y = nil s.xReceived = false s.vAReceived = false - C.bn_new_wrapper((*C.bn_st)(&s.x)) + C.bn_new_wrapper((*C.Fr)(&s.x)) } // Start triggers the protocol start for the current participant. 
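Fr_polynomialImage above still evaluates the share polynomial with Relic bignums. The target shape after the migration is a Horner evaluation over Fr; a rough sketch is below, where Fr_set_limb, Fr_mul and Fr_add are hypothetical helpers (not part of this patch) standing for a small-integer load, a multiplication and an addition mod r:

    // image = a_0 + a_1*x + ... + a_(n-1)*x^(n-1) (mod r) with n = a_size, by Horner's rule
    static void Fr_polynomialImage_sketch(Fr* image, const Fr* a, const int a_size, const byte x) {
        Fr x_fr;
        Fr_set_limb(&x_fr, x);              // hypothetical: x_fr = x as an Fr element
        *image = a[a_size - 1];
        for (int i = a_size - 2; i >= 0; i--) {
            Fr_mul(image, image, &x_fr);    // hypothetical: image *= x   (mod r)
            Fr_add(image, image, &a[i]);    // hypothetical: image += a_i (mod r)
        }
    }
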
@@ -264,7 +265,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { return fmt.Errorf("generating shares failed: %w", err) } - // Generate a polyomial P in Zr[X] of degree t + // Generate a polyomial P in Fr[X] of degree t s.a = make([]scalar, s.threshold+1) s.vA = make([]pointG2, s.threshold+1) s.y = make([]pointG2, s.size) @@ -273,7 +274,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { generatorScalarMultG2(&s.vA[0], &s.a[0]) if s.threshold > 0 { for i := 1; i < s.threshold; i++ { - C.bn_new_wrapper((*C.bn_st)(&s.a[i])) + C.bn_new_wrapper((*C.Fr)(&s.a[i])) randZr(&s.a[i]) generatorScalarMultG2(&s.vA[i], &s.a[i]) } @@ -288,7 +289,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { if i-1 == s.myIndex { xdata := make([]byte, shareSize) zrPolynomialImage(xdata, s.a, i, &s.y[i-1]) - C.bn_read_bin((*C.bn_st)(&s.x), + C.bn_read_bin((*C.Fr)(&s.x), (*C.uchar)(&xdata[0]), PrKeyLenBLSBLS12381, ) @@ -350,7 +351,7 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { } // read the participant private share - if C.bn_read_Zr_bin((*C.bn_st)(&s.x), + if C.Fr_read_bytes((*C.Fr)(&s.x), (*C.uchar)(&data[0]), PrKeyLenBLSBLS12381, ) != valid { @@ -405,14 +406,14 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { } } -// zrPolynomialImage computes P(x) = a_0 + a_1*x + .. + a_n*x^n (mod r) in Z/Zr +// zrPolynomialImage computes P(x) = a_0 + a_1*x + .. + a_n*x^n (mod r) in Z/Fr // r being the order of G1 // P(x) is written in dest, while g2^P(x) is written in y // x being a small integer func zrPolynomialImage(dest []byte, a []scalar, x index, y *pointG2) { - C.Zr_polynomialImage_export((*C.uchar)(&dest[0]), + C.Fr_polynomialImage_export((*C.uchar)(&dest[0]), (*C.ep2_st)(y), - (*C.bn_st)(&a[0]), (C.int)(len(a)), + (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(x), ) } @@ -441,7 +442,7 @@ func readVerifVector(A []pointG2, src []byte) error { func (s *feldmanVSSstate) verifyShare() bool { // check y[current] == x.G2 - return C.verifyshare((*C.bn_st)(&s.x), + return C.verifyshare((*C.Fr)(&s.x), (*C.ep2_st)(&s.y[s.myIndex])) == 1 } @@ -455,3 +456,4 @@ func (s *feldmanVSSstate) computePublicKeys() { (*C.ep2_st)(&s.vA[0]), (C.int)(len(s.vA)), ) } +*/ diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index ecb26f7d6e3..76f343256a4 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #include "dkg_include.h" import "C" @@ -27,7 +28,7 @@ import ( // a complaint answer. The protocol ends with all honest participants // reaching a consensus about the dealer qualification/disqualification. -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. 
// feldman VSS protocol, with complaint mechanism, implements DKGState @@ -402,7 +403,7 @@ func (s *feldmanVSSQualState) receiveShare(origin index, data []byte) { return } // read the participant private share - if C.bn_read_Zr_bin((*C.bn_st)(&s.x), + if C.Fr_read_bytes((*C.Fr)(&s.x), (*C.uchar)(&data[0]), PrKeyLenBLSBLS12381, ) != valid { @@ -507,7 +508,7 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) // - true if the complaint answer is not correct func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { // check y[complainer] == share.G2 - return C.verifyshare((*C.bn_st)(&c.answer), + return C.verifyshare((*C.Fr)(&c.answer), (*C.ep2_st)(&s.y[complainer])) == 0 } @@ -624,8 +625,8 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) } // read the complainer private share - C.bn_new_wrapper((*C.bn_st)(&s.complaints[complainer].answer)) - if C.bn_read_Zr_bin((*C.bn_st)(&s.complaints[complainer].answer), + C.bn_new_wrapper((*C.Fr)(&s.complaints[complainer].answer)) + if C.Fr_read_bytes((*C.Fr)(&s.complaints[complainer].answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, ) != valid { @@ -648,8 +649,8 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) // flag check is a sanity check if c.received { // read the complainer private share - C.bn_new_wrapper((*C.bn_st)(&c.answer)) - if C.bn_read_Zr_bin((*C.bn_st)(&c.answer), + C.bn_new_wrapper((*C.Fr)(&c.answer)) + if C.Fr_read_bytes((*C.Fr)(&c.answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, ) != valid { @@ -672,4 +673,4 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) s.x = c.answer } } -} +}*/ diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 5e518300071..f50b143961d 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -9,8 +9,8 @@ #define MAX_IND 255 #define MAX_IND_BITS 8 -void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x); -void Zr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); +void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x); +void Fr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); void G2_polynomialImages(ep2_st* y, const int len_y, const ep2_st* A, const int len_A); void ep2_vector_write_bin(byte* out, const ep2_st* A, const int len); int ep2_vector_read_bin(ep2_st* A, const byte* src, const int len); diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index d79379f7d83..51733e803fb 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" @@ -34,7 +35,7 @@ import ( // from the protocol, and the overall key is taking into account // all chunks from qualified dealers. -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. 
// Joint Feldman protocol, with complaint mechanism, implements DKGState @@ -202,7 +203,7 @@ func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { jointx, jointPublicKey, jointy := s.sumUpQualifiedKeys(s.size - disqualifiedTotal) // private key of the current participant - x := newPrKeyBLSBLS12381(jointx) + x := newPrKeyBLSBLS12381(&jointx) // Group public key Y := newPubKeyBLSBLS12381(jointPublicKey) @@ -303,8 +304,8 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointG2 // sum up x var jointx scalar - C.bn_new_wrapper((*C.bn_st)(&jointx)) - C.bn_sum_vector((*C.bn_st)(&jointx), (*C.bn_st)(&qualifiedx[0]), + C.bn_new_wrapper((*C.Fr)(&jointx)) + C.Fr_sum_vector((*C.Fr)(&jointx), (*C.Fr)(&qualifiedx[0]), (C.int)(qualified)) // sum up Y var jointPublicKey pointG2 @@ -338,4 +339,4 @@ func (s *JointFeldmanState) getQualifiedKeys(qualified int) ([]scalar, []pointG2 } } return qualifiedx, qualifiedPubKey, qualifiedy -} +}*/ diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index d996ae0835c..104cb8ef56f 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -3,6 +3,7 @@ package crypto +/* import ( "fmt" mrand "math/rand" @@ -833,4 +834,4 @@ func TestDKGTransitionErrors(t *testing.T) { assert.True(t, IsDKGInvalidStateTransitionError(err)) } }) -} +}*/ diff --git a/crypto/thresholdsign.go b/crypto/thresholdsign.go index 2dae7061b76..ebb814dee5b 100644 --- a/crypto/thresholdsign.go +++ b/crypto/thresholdsign.go @@ -16,10 +16,10 @@ import ( // the input threshold value (t) should be set to t = floor((n-1)/2). const ( - // ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol - ThresholdSignMinSize = MinimumThreshold + 1 - // ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol - ThresholdSignMaxSize = DKGMaxSize +// ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol +// ThresholdSignMinSize = MinimumThreshold + 1 +// ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol +// ThresholdSignMaxSize = DKGMaxSize ) // ThresholdSignatureInspector is an inspector of the threshold signature protocol. 
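One note on the transitional bridge introduced in this commit: Fr_blst_to_relic heap-allocates the bn_st it returns, so callers that keep the ep_mult pattern are expected to release it once the Relic call is done. A sketch under the ALLOC=AUTO assumption stated in bls12381_utils.c, where freeing the struct itself is sufficient (the free call is not part of this patch):

    void ep_mult(ep_t res, const ep_t p, const Fr* expo) {
        bn_st* tmp_expo = Fr_blst_to_relic(expo);   // heap-allocated temporary
        ep_mul_lwnaf(res, p, tmp_expo);             // window-NAF multiplication, as before
        free(tmp_expo);                             // release the temporary bignum
    }
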
From b99d75bc530c46b9b90ab523c386d6fab65749e7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 18 Feb 2023 20:36:07 -0600 Subject: [PATCH 011/200] use new type Fr in BLS simple sig --- crypto/bls.go | 5 +- crypto/bls12381_utils.c | 86 ++++++++++++----------------------- crypto/bls12381_utils.go | 46 +++++++++++++------ crypto/bls12381_utils.h | 33 +++++++------- crypto/bls12381_utils_test.go | 2 +- crypto/bls_include.h | 2 +- crypto/bls_test.go | 4 +- crypto/blst_include.h | 15 +++++- crypto/blst_src/README.md | 5 +- crypto/blst_tools.c | 50 ++++++++++++++++++++ crypto/dkg_feldmanvss.go | 2 - crypto/dkg_feldmanvssq.go | 2 - crypto/dkg_jointfeldman.go | 1 - 13 files changed, 150 insertions(+), 103 deletions(-) create mode 100644 crypto/blst_tools.c diff --git a/crypto/bls.go b/crypto/bls.go index e4b9d4825b6..8abee2c9200 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -61,7 +61,7 @@ const ( // // SignatureLenBLSBLS12381 is the size of G1 elements SignatureLenBLSBLS12381 = fieldSize * (2 - serializationG1) // the length is divided by 2 if compression is on - PrKeyLenBLSBLS12381 = 32 + PrKeyLenBLSBLS12381 = 32 // equal to frBytesLen // PubKeyLenBLSBLS12381 is the size of G2 elements PubKeyLenBLSBLS12381 = 2 * fieldSize * (2 - serializationG2) // the length is divided by 2 if compression is on @@ -271,7 +271,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { // L is the OKM length // L = ceil((3 * ceil(log2(r))) / 16) which makes L (security_bits/8)-larger than r size - okmLength := (3 * PrKeyLenBLSBLS12381) / 2 + okmLength := (3 * frBytesLen) / 2 // HKDF secret = IKM || I2OSP(0, 1) secret := make([]byte, len(ikm)+1) @@ -320,6 +320,7 @@ func BLSInvalidSignature() Signature { } // decodePrivateKey decodes a slice of bytes into a private key. +// Decoding assumes a bytes big endian format. // It checks the scalar is non-zero and is less than the group order. func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { sk := newPrKeyBLSBLS12381(nil) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 28cf52f04a2..d71e583daf0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -29,8 +29,8 @@ int get_invalid() { return INVALID; } -void bn_new_wrapper(bn_t a) { - bn_new(a); +int get_Fr_BYTES() { + return Fr_BYTES; } // global variable of the pre-computed data @@ -171,7 +171,15 @@ void bytes_print_(char* s, byte* data, int len) { printf("\n"); } -// DEBUG printing functions +void Fr_print_(char* s, Fr* a) { + printf("[%s]:\n", s); + limb_t* p = (limb_t*)(a) + Fr_DIGITS; + for (int i=0; iep_r);*/ -} - -// Reads a scalar from an array and maps it to Zr. -// The resulting scalar `a` satisfies 0 <= a < r. -// `len` must be less than BITS_TO_BYTES(RLC_BN_BITS). -// It returns VALID if scalar is zero and INVALID otherwise -int bn_map_to_Zr(Fr* a, const uint8_t* bin, int len) { - /*bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_mod(a, tmp, &core_get()->ep_r); - bn_rand(tmp, RLC_POS, len << 3); // overwrite tmp - bn_free(tmp); - if (bn_cmp_dig(a, 0) == RLC_EQ) { - return VALID; - } - return INVALID;*/ -} - -// Reads a scalar from an array and maps it to Zr*. -// The resulting scalar `a` satisfies 0 < a < r. 
-// `len` must be less than BITS_TO_BYTES(RLC_BN_BITS) -void bn_map_to_Zr_star(Fr* a, const uint8_t* bin, int len) { - /*bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_t r_1; - bn_new(r_1); - bn_sub_dig(r_1, &core_get()->ep_r, 1); - bn_mod_basic(a,tmp,r_1); - bn_add_dig(a,a,1); - bn_rand(tmp, RLC_POS, len << 3); // overwrite tmp - bn_free(tmp); - bn_free(r_1);*/ +// Reads a scalar from an array and maps it to Fr. +// It returns true if scalar is zero and false otherwise. +bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { + vec256_from_be_bytes((limb_t*)a, bin, len); + return Fr_is_zero(a); } // returns the sign of y. @@ -561,6 +522,8 @@ bool_t Fr_is_equal(const Fr* a, const Fr* b) { } // reads a scalar in `a` and checks it is a valid Fr element (a < r). +// input bytes are big endian. +// returns: // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr // - v if the scalar is valid @@ -568,15 +531,19 @@ BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { if (len != Fr_BYTES) { return BLST_BAD_ENCODING; } - if (!check_mod_256(bin, BLS12_381_r)) { // check_mod_256 compares byte[] against a vec256! + pow256 tmp; + pow256_from_be_bytes(tmp, bin); + if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! return BLST_BAD_SCALAR; } + vec_zero(tmp, Fr_BYTES); limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); return BLST_SUCCESS; } // reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). -// returns +// input bytes are big endian. +// returns: // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr_star // - BLST_SUCCESS if the scalar is valid @@ -592,6 +559,11 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { return BLST_SUCCESS; } +// write Fr element `a` in big endian bytes. 
+void Fr_write_bytes(uint8_t *bin, const Fr* a) { + be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); +} + // computes the sum of the array elements x and writes the sum in jointx // the sum is computed in Fr void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) { diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 43e6a782291..d569bf0cc38 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,6 +14,7 @@ package crypto // #include "bls12381_utils.h" import "C" import ( + "crypto/rand" "errors" ) @@ -23,6 +24,9 @@ type pointG1 C.ep_st type pointG2 C.ep2_st type scalar C.Fr +// BLS12-381 related lengths +var frBytesLen = int(C.get_Fr_BYTES()) + // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary //type pointG1_blst C.G1 @@ -117,33 +121,47 @@ func (p *pointG2) isInfinity() bool { return C.ep2_is_infty((*C.ep2_st)(p)) == 1 } -// returns a random number in Fr -func randZr(x *scalar) { - //C.bn_randZr((*C.Fr)(x)) +// returns a random element of Fr in input pointer +func randZr(x *scalar) error { + bytes := make([]byte, frBytesLen+securityBits/8) + _, err := rand.Read(bytes) // checking one output is enough + if err != nil { + return errors.New("internal rng failed") + } + _ = mapToZr(x, bytes) + return nil } -// returns a random non-zero number in Fr -func randZrStar(x *scalar) { - //C.bn_randZr_star((*C.Fr)(x)) +// writes a random element of Fr* in input pointer +func randZrStar(x *scalar) error { + bytes := make([]byte, frBytesLen+securityBits/8) + isZero := true + for isZero { + _, err := rand.Read(bytes) // checking one output is enough + if err != nil { + return errors.New("internal rng failed") + } + isZero = mapToZr(x, bytes) + } + return nil } // mapToZr reads a scalar from a slice of bytes and maps it to Zr. -// The resulting scalar `k` satisfies 0 <= k < r. +// The resulting element `k` therefore satisfies 0 <= k < r. // It returns true if scalar is zero and false otherwise. 
func mapToZr(x *scalar, src []byte) bool { - isZero := C.bn_map_to_Zr((*C.Fr)(x), + isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - return isZero == valid + if isZero { + return true + } + return false } // writeScalar writes a G2 point in a slice of bytes func writeScalar(dest []byte, x *scalar) { - /*C.bn_write_bin((*C.uchar)(&dest[0]), - (C.int)(prKeyLengthBLSBLS12381), - (*C.Fr)(x), - )*/ - // TODO: to fill + C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) } // writePointG2 writes a G2 point in a slice of bytes diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 7b5ec0508f0..8d5a8ed0a6e 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -81,6 +81,10 @@ typedef struct prec_ { // TODO: to delete when Relic is removed bn_st* Fr_blst_to_relic(const Fr* x); +int get_valid(); +int get_invalid(); +int get_Fr_BYTES(); + // BLS based SPoCK int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); @@ -88,25 +92,24 @@ int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); void map_to_G1(ep_t, const byte*, const int); // Fr utilities -bool_t Fr_is_zero(const Fr* a); -bool_t Fr_is_equal(const Fr* a, const Fr* b); +bool_t Fr_is_zero(const Fr* a); +bool_t Fr_is_equal(const Fr* a, const Fr* b); +BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); +BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); +void Fr_write_bytes(uint8_t *bin, const Fr* a); +bool map_bytes_to_Fr(Fr*, const uint8_t*, int); // Utility functions -int get_valid(); -int get_invalid(); -void bn_new_wrapper(bn_t a); - ctx_t* relic_init_BLS12_381(); prec_st* init_precomputed_data_BLS12_381(); void precomputed_data_set(const prec_st* p); void seed_relic(byte*, int); -int ep_read_bin_compact(ep_t, const byte *, const int); -void ep_write_bin_compact(byte *, const ep_t, const int); -int ep2_read_bin_compact(ep2_t, const byte *, const int); -void ep2_write_bin_compact(byte *, const ep2_t, const int); -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); +int ep_read_bin_compact(ep_t, const byte *, const int); +void ep_write_bin_compact(byte *, const ep_t, const int); +int ep2_read_bin_compact(ep2_t, const byte *, const int); +void ep2_write_bin_compact(byte *, const ep2_t, const int); + @@ -116,11 +119,6 @@ void ep_mult(ep_t, const ep_t, const Fr*); void ep2_mult_gen(ep2_t, const Fr*); void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); -void bn_randZr(Fr*); -void bn_randZr_star(Fr*); -int bn_map_to_Zr(Fr*, const uint8_t*, int); -void bn_map_to_Zr_star(Fr*, const uint8_t*, int); - void Fr_sum_vector(Fr*, const Fr*, const int); void ep_sum_vector(ep_t, ep_st*, const int); void ep2_sum_vector(ep2_t, ep2_st*, const int); @@ -147,6 +145,7 @@ void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); // Debugging related functions void bytes_print_(char*, byte*, int); +void Fr_print_(char*, Fr*); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ac5fc6ecc93..877eff219e3 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -54,7 +54,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { _, _ = rand.Read(seed) _ = seedRelic(seed) var expo scalar - randZr(&expo) // TODO: upadate + randZr(&expo) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git 
a/crypto/bls_include.h b/crypto/bls_include.h index 325203479b2..0e965bac88e 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -8,7 +8,7 @@ #include "relic.h" #include "bls12381_utils.h" -// Signature, Public key and Private key lengths +// Signature, Public key and Private key lengths #define FULL_SIGNATURE_LEN G1_BYTES #define FULL_PK_LEN G2_BYTES #define SIGNATURE_LEN (FULL_SIGNATURE_LEN/(G1_SERIALIZATION+1)) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 579700a183e..5e4a13564bd 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -151,7 +151,7 @@ func TestBLSEncodeDecode(t *testing.T) { // specific tests for BLS // zero private key - skBytes := make([]byte, PrKeyLenBLSBLS12381) + /*skBytes := make([]byte, PrKeyLenBLSBLS12381) sk, err := DecodePrivateKey(BLSBLS12381, skBytes) require.Error(t, err, "decoding identity private key should fail") assert.True(t, IsInvalidInputsError(err)) @@ -195,7 +195,7 @@ func TestBLSEncodeDecode(t *testing.T) { invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") require.NoError(t, err) _, err = DecodePublicKey(BLSBLS12381, invalidPk2) - assert.Error(t, err) + assert.Error(t, err)*/ } // TestBLSEquals tests equal for BLS keys diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 2721edcd97a..0733bda0b30 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -3,18 +3,29 @@ #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ -// blst related definitions +// extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h #include "blst.h" // TODO: should be deleted #include "point.h" #include "consts.h" +// types used by the Flow crypto library that are imported from BLST +// these type definitions are used as an abstraction from BLST internal types + // field elements F_r -typedef struct {limb_t limbs[4];} Fr; // also used as vec256; +typedef struct {limb_t limbs[4];} Fr; // also used as vec256 (little endian limbs) // Subroup G1 in E1 typedef POINTonE1 G1; // Subroup G1 in E2 typedef POINTonE2 G2; + +// extra functions and tools that are needed by the Flow crypto library +// and that are not exported in the desired form by BLST + +void pow256_from_be_bytes(pow256 ret, const unsigned char a[32]); +void vec256_from_be_bytes(vec256 out, const unsigned char *bytes, size_t n); + + #endif \ No newline at end of file diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 12bc7b863ca..877c9db7ee5 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -7,10 +7,11 @@ specifically from the commit <92c12ac58095de04e776cec5ef5ce5bdf242b693>. While BLST exports multiple functions and tools, the implementation in Flow crypto requires access to low level functions. Some of these tools are not exported by BLST, others would need to be used without paying for the cgo cost, and therefore without using the Go bindings in BLST. 
- The folder contains: - BLST LICENSE file - all /src/*.c and /src/*.h files (C source files) - all /build (assembly generated files) - /bindings/blst.h (headers of external functions) -- /bindings/blst_aux.h (headers of external aux functions) \ No newline at end of file +- /bindings/blst_aux.h (headers of external aux functions) + +TODO: add steps for upgrading the BLST version \ No newline at end of file diff --git a/crypto/blst_tools.c b/crypto/blst_tools.c new file mode 100644 index 00000000000..dcc1b1171a4 --- /dev/null +++ b/crypto/blst_tools.c @@ -0,0 +1,50 @@ +// +build relic + +// extra tools to use BLST low level that are needed by the Flow crypto library + +#include "blst_include.h" +#include "bls12381_utils.h" + +// internal type of BLST `pow256` uses bytes little endian. +// input is bytes big endian as used by Flow crypto lib external scalars. +void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) +{ + unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; + if ((uptr_t)ret == (uptr_t)a) { // swap in place + for (int i=0; i 32) { + limbs_from_be_bytes(digit, bytes -= 32, 32); + from_mont_256(digit, digit, BLS12_381_r, r0); + mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); + add_mod_256(out, out, digit, BLS12_381_r); + mul_mont_sparse_256(radix, radix, BLS12_381_rRR, BLS12_381_r, r0); + n -= 32; + } + limbs_from_be_bytes(digit, bytes -= n, n); + from_mont_256(digit, digit, BLS12_381_r, r0); + mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); + add_mod_256(out, out, digit, BLS12_381_r); + + vec_zero(digit, sizeof(digit)); +} \ No newline at end of file diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 76a5aebcd49..221253168cd 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -83,7 +83,6 @@ func (s *feldmanVSSstate) init() { s.y = nil s.xReceived = false s.vAReceived = false - C.bn_new_wrapper((*C.Fr)(&s.x)) } // Start triggers the protocol start for the current participant. 
@@ -274,7 +273,6 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { generatorScalarMultG2(&s.vA[0], &s.a[0]) if s.threshold > 0 { for i := 1; i < s.threshold; i++ { - C.bn_new_wrapper((*C.Fr)(&s.a[i])) randZr(&s.a[i]) generatorScalarMultG2(&s.vA[i], &s.a[i]) } diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 76f343256a4..8a92cd5dff3 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -625,7 +625,6 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) } // read the complainer private share - C.bn_new_wrapper((*C.Fr)(&s.complaints[complainer].answer)) if C.Fr_read_bytes((*C.Fr)(&s.complaints[complainer].answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, @@ -649,7 +648,6 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) // flag check is a sanity check if c.received { // read the complainer private share - C.bn_new_wrapper((*C.Fr)(&c.answer)) if C.Fr_read_bytes((*C.Fr)(&c.answer), (*C.uchar)(&data[1]), PrKeyLenBLSBLS12381, diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 51733e803fb..be8b2c9f70f 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -304,7 +304,6 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointG2 // sum up x var jointx scalar - C.bn_new_wrapper((*C.Fr)(&jointx)) C.Fr_sum_vector((*C.Fr)(&jointx), (*C.Fr)(&qualifiedx[0]), (C.int)(qualified)) // sum up Y From e3f4fee85672701dfa6dc84d253cb7ecdf6d974a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 25 Feb 2023 16:31:42 -0600 Subject: [PATCH 012/200] implement BLS threshold signature with new Fr type --- crypto/bls.go | 2 +- crypto/bls12381_hashtocurve.c | 8 + crypto/bls12381_utils.c | 247 +++++++++++++++++++---------- crypto/bls12381_utils.h | 14 +- crypto/bls12381_utils_test.go | 51 +----- crypto/bls_core.c | 22 +-- crypto/bls_test.go | 4 +- crypto/bls_thresholdsign.go | 20 +-- crypto/bls_thresholdsign_core.c | 222 +++++++++++++++----------- crypto/bls_thresholdsign_include.h | 8 +- crypto/bls_thresholdsign_test.go | 20 +-- crypto/blst_include.h | 8 +- crypto/blst_tools.c | 26 --- crypto/dkg.go | 3 +- crypto/dkg_core.c | 56 +++---- crypto/dkg_feldmanvssq.go | 3 +- crypto/dkg_include.h | 6 +- crypto/dkg_test.go | 17 +- crypto/thresholdsign.go | 8 +- 19 files changed, 393 insertions(+), 352 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 8abee2c9200..48996e0ae9d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -222,7 +222,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) case valid: return true, nil default: - return false, fmt.Errorf("signature verification failed") + return false, fmt.Errorf("signature verification failed: code %d", verif) } } diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c index 229f9c009de..62053d7ed22 100644 --- a/crypto/bls12381_hashtocurve.c +++ b/crypto/bls12381_hashtocurve.c @@ -335,4 +335,12 @@ void map_to_G1(ep_t h, const byte* data, const int len) { #elif hashToPoint==RELIC_SSWU ep_map_from_field(h, data, len); #endif + + /*Fr a, b; + Fr_set_limb(&a, 1); + Fr_print_("a", &a); + Fr_inv_montg_eucl(&b,&a); + Fr_print_("b", &b); + Fr_from_montg(&b, &b); + Fr_print_("b", &b); */ } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d71e583daf0..c8dfb808827 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,23 @@ #include "bls_include.h" #include "assert.h" +// The functions are 
tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) + +// return macro values to the upper Go Layer +int get_valid() { + return VALID; +} + +int get_invalid() { + return INVALID; +} + +int get_Fr_BYTES() { + return Fr_BYTES; +} + +// Fr utilities + // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { bn_st* out = (bn_st*)malloc(sizeof(bn_st)); @@ -18,19 +35,158 @@ bn_st* Fr_blst_to_relic(const Fr* x) { return out; } -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) +// returns true if a == 0 and false otherwise +bool_t Fr_is_zero(const Fr* a) { + return bytes_are_zero((const byte*)a, Fr_BYTES); +} -// return macro values to the upper Go Layer -int get_valid() { - return VALID; +// returns true if a == b and false otherwise +bool_t Fr_is_equal(const Fr* a, const Fr* b) { + return vec_is_equal(a, b, Fr_BYTES); } -int get_invalid() { - return INVALID; +// sets `a` to limb `l` +void Fr_set_limb(Fr* a, const limb_t l){ + vec_zero((byte*)a + sizeof(limb_t), Fr_BYTES - sizeof(limb_t)); + *((limb_t*)a) = l; } -int get_Fr_BYTES() { - return Fr_BYTES; +void Fr_copy(Fr* res, Fr* a) { + vec_copy((byte*)res, (byte*)a, Fr_BYTES); +} + +// sets `a` to 0 +void Fr_set_zero(Fr* a){ + vec_zero((byte*)a, Fr_BYTES); +} + +void Fr_add(Fr *res, const Fr *a, const Fr *b) { + add_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); +} + +void Fr_sub(Fr *res, const Fr *a, const Fr *b) { + sub_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); +} + +void Fr_neg(Fr *res, const Fr *a) { + cneg_mod_256((limb_t*)res, (limb_t*)a, 1, BLS12_381_r); +} + +void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b) { + mul_mont_sparse_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r, r0); +} + +void Fr_to_montg(Fr *res, const Fr *a) { + mul_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void Fr_from_montg(Fr *res, const Fr *a) { + from_mont_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); +} + +// result is in Montgomery form +// res = a^(-1)*R +void Fr_inv_montg_eucl(Fr *res, const Fr *a) { + // copied and modified from BLST code + // Copyright Supranational LLC + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + ct_inverse_mod_256(temp, (limb_t*)a, BLS12_381_r, rx2); + redc_mont_256((limb_t*)res, temp, BLS12_381_r, r0); +} + +void Fr_inv_montg_expo(Fr *res, const Fr *a) { + // TODO: +} + +// computes the sum of the array elements and writes the sum in jointx +void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { + Fr_set_zero(jointx); + for (int i=0; i Fr_BYTES) { + limbs_from_be_bytes((limb_t*)&digit, bytes -= Fr_BYTES, Fr_BYTES); // l_i + Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) + Fr_add(out, out, &digit); + Fr_mul_montg(&radix, &radix, (Fr*)BLS12_381_rRR); // R^(i+1) + n -= Fr_BYTES; + } + Fr_set_zero(&digit); + limbs_from_be_bytes((limb_t*)&digit, bytes -= n, n); + Fr_mul_montg(&digit, &digit, &radix); + Fr_add(out, out, &digit); + // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n + // reduce the extra R + Fr_from_montg(out, out); + // clean up possible sensitive data + Fr_set_zero(&digit); +} + +// Reads a scalar from an array and maps it to Fr. +// It returns true if scalar is zero and false otherwise. 
+bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { + vec256_from_be_bytes(a, bin, len); + //Fr_set_limb(a, 1); TODO: delete + return Fr_is_zero(a); } // global variable of the pre-computed data @@ -160,7 +316,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { void ep2_mult_gen(ep2_t res, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 - g2_mul_gen(res, (bn_st*)tmp_expo); + g2_mul_gen(res, tmp_expo); } // DEBUG printing functions @@ -204,13 +360,6 @@ void ep2_print_(char* s, ep2_st* p) { g2_print(p); } -// Reads a scalar from an array and maps it to Fr. -// It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { - vec256_from_be_bytes((limb_t*)a, bin, len); - return Fr_is_zero(a); -} - // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. static int fp_get_sign(const fp_t y) { @@ -513,72 +662,6 @@ int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { return RLC_ERR; } -bool_t Fr_is_zero(const Fr* a) { - return bytes_are_zero((const byte*)a, Fr_BYTES); -} - -bool_t Fr_is_equal(const Fr* a, const Fr* b) { - return vec_is_equal(a, b, Fr_BYTES); -} - -// reads a scalar in `a` and checks it is a valid Fr element (a < r). -// input bytes are big endian. -// returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr -// - v if the scalar is valid -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { - if (len != Fr_BYTES) { - return BLST_BAD_ENCODING; - } - pow256 tmp; - pow256_from_be_bytes(tmp, bin); - if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! - return BLST_BAD_SCALAR; - } - vec_zero(tmp, Fr_BYTES); - limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); - return BLST_SUCCESS; -} - -// reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). -// input bytes are big endian. -// returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr_star -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { - int ret = Fr_read_bytes(a, bin, len); - if (ret != BLST_SUCCESS) { - return ret; - } - // check if a=0 - if (Fr_is_zero(a)) { - return BLST_BAD_SCALAR; - } - return BLST_SUCCESS; -} - -// write Fr element `a` in big endian bytes. -void Fr_write_bytes(uint8_t *bin, const Fr* a) { - be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); -} - -// computes the sum of the array elements x and writes the sum in jointx -// the sum is computed in Fr -void Fr_sum_vector(Fr* jointx, const Fr* x, const int len) { - /*bn_t r; - bn_new(r); - g2_get_ord(r); - bn_set_dig(jointx, 0); - bn_new_size(jointx, BITS_TO_DIGITS(Fr_BITS+1)); - for (int i=0; iep_r) != RLC_LT || bn_cmp_dig(a, 0) != RLC_GT) { - return INVALID; - } - return VALID; -} - // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. int check_membership_G1(const ep_t p){ @@ -93,8 +84,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. 
-static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const int len) { - +static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -109,7 +99,7 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i // elemsG2[1] = pk ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk); + ep2_copy(elemsG2[1], (ep2_st*)pk); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -118,12 +108,14 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i fp12_t pair; fp12_new(&pair); + if (core_get()->code != RLC_OK) printf("EUUUUUUUU\n"); // double pairing with Optimal Ate pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); // compare the result to 1 int res = fp12_cmp_dig(pair, 1); + #elif SINGLE_PAIRING fp12_t pair1, pair2; fp12_new(&pair1); fp12_new(&pair2); @@ -342,12 +334,14 @@ int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) // deserialize the signature into a curve point int read_ret = ep_read_bin_compact(s, sig, SIGNATURE_LEN); - if (read_ret != RLC_OK) + if (read_ret != RLC_OK) { return read_ret; + } // check s is in G1 - if (check_membership_G1(s) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (check_membership_G1(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 return INVALID; + } return bls_verify_ep(pk, s, data, len); } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 5e4a13564bd..579700a183e 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -151,7 +151,7 @@ func TestBLSEncodeDecode(t *testing.T) { // specific tests for BLS // zero private key - /*skBytes := make([]byte, PrKeyLenBLSBLS12381) + skBytes := make([]byte, PrKeyLenBLSBLS12381) sk, err := DecodePrivateKey(BLSBLS12381, skBytes) require.Error(t, err, "decoding identity private key should fail") assert.True(t, IsInvalidInputsError(err)) @@ -195,7 +195,7 @@ func TestBLSEncodeDecode(t *testing.T) { invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") require.NoError(t, err) _, err = DecodePublicKey(BLSBLS12381, invalidPk2) - assert.Error(t, err)*/ + assert.Error(t, err) } // TestBLSEquals tests equal for BLS keys diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index df3da1a108d..ef4630a7341 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -5,7 +5,7 @@ package crypto // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" -/*import "C" +import "C" import ( "fmt" @@ -409,14 +409,11 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat signers := make([]index, 0, len(s.shares)) for index, share := range s.shares { shares = append(shares, share...) 
- signers = append(signers, index) + signers = append(signers, index+1) } - - - // Lagrange Interpolate at point 0 - result := C.G1_lagrangeInterpolateAtZero( + result := C.G1_lagrangeInterpolateAtZero_serialized( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&shares[0]), (*C.uint8_t)(&signers[0]), (C.int)(s.threshold+1)) @@ -457,8 +454,6 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat func BLSReconstructThresholdSignature(size int, threshold int, shares []Signature, signers []int) (Signature, error) { - - if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, invalidInputsErrorf( "size should be between %d and %d", @@ -501,12 +496,12 @@ func BLSReconstructThresholdSignature(size int, threshold int, "%d is a duplicate signer", index(signers[i])) } m[index(signers[i])] = true - indexSigners = append(indexSigners, index(signers[i])) + indexSigners = append(indexSigners, index(signers[i])+1) } thresholdSignature := make([]byte, signatureLengthBLSBLS12381) // Lagrange Interpolate at point 0 - if C.G1_lagrangeInterpolateAtZero( + if C.G1_lagrangeInterpolateAtZero_serialized( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&flatShares[0]), (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold+1), @@ -558,9 +553,6 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, threshold) } - - - // the scalars x and G2 points y x := make([]scalar, size) y := make([]pointG2, size) @@ -604,4 +596,4 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // are sampled uniformly at random. The probability of // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil -}*/ +} diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 94a12a024d7..68f5005ace4 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -2,122 +2,154 @@ #include "bls_thresholdsign_include.h" -// Computes the Lagrange coefficient L(i+1) at 0 with regards to the range [signers(0)+1..signers(t)+1] -// and stores it in res, where t is the degree of the polynomial P -static void Zr_lagrangeCoefficientAtZero(bn_t res, const int i, const uint8_t* signers, const int len){ - // r is the order of G1 and G2 - bn_t r, r_2; - bn_new(r); - g2_get_ord(r); - // (r-2) is needed to compute the inverse in Fr - // using little Fermat theorem - bn_new(r_2); - bn_sub_dig(r_2, r, 2); - //#define MOD_METHOD MONTY - #define MOD_METHOD BASIC - - #if MOD_METHOD == MONTY - bn_t u; - bn_new(u) - // Montgomery reduction constant - // TODO: hardcode u - bn_mod_pre_monty(u, r); - #endif +// the highest index of a threshold participant +#define MAX_IND 255 +#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND)) + +// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] +// and stores it in `res`, where t is the degree of the polynomial P. +// `len` is equal to `t+1` where `t` is the polynomial degree. 
+static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t indices[], const int len){ + + // coefficient is computed as N * D^(-1) + Fr numerator; // eventually would represent N*R^k + Fr denominator; // eventually would represent D*R^k + + // Initialize N and D to Montgomery constant R + // TODO: hardcode R and add Fr_copy function + Fr_copy(&numerator, (Fr*)BLS12_381_rRR); + Fr_copy(&denominator, (Fr*)BLS12_381_rRR); + Fr_from_montg(&numerator, &numerator); + Fr_from_montg(&denominator, &denominator); + + // sign of D: 1 for positive and 0 for negative + int sign = 1; + + // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) + // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. + #define MAX_IND_LOOPS 64/MAX_IND_BITS + + // choose inversion algorithm used for denominator + #define FERMAT_INVERSION 0 + #define EUCLIDEAN_INVERSION (FERMAT_INVERSION^1) - // temp buffers - bn_t acc, inv, base, numerator; - bn_new(inv); - bn_new(base); - bn_new_size(base, BITS_TO_DIGITS(Fr_BITS)) - bn_new(acc); - bn_new(numerator); - bn_new_size(acc, BITS_TO_DIGITS(3*Fr_BITS)); - - // the accumulator of the largarnge coeffiecient - // the sign (sign of acc) is equal to 1 if acc is positive, 0 otherwise - bn_set_dig(acc, 1); - int sign = 1; - - // loops is the maximum number of loops that takes the accumulator to - // overflow modulo r, mainly the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < r const int loops = MAX_IND_LOOPS; int k,j = 0; + Fr tmp; while (j 32) { - limbs_from_be_bytes(digit, bytes -= 32, 32); - from_mont_256(digit, digit, BLS12_381_r, r0); - mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); - add_mod_256(out, out, digit, BLS12_381_r); - mul_mont_sparse_256(radix, radix, BLS12_381_rRR, BLS12_381_r, r0); - n -= 32; - } - limbs_from_be_bytes(digit, bytes -= n, n); - from_mont_256(digit, digit, BLS12_381_r, r0); - mul_mont_sparse_256(digit, digit, radix, BLS12_381_r, r0); - add_mod_256(out, out, digit, BLS12_381_r); - - vec_zero(digit, sizeof(digit)); } \ No newline at end of file diff --git a/crypto/dkg.go b/crypto/dkg.go index 3e369b77fa4..1cdf87a128e 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -1,6 +1,5 @@ package crypto -/* import ( "errors" "fmt" @@ -235,4 +234,4 @@ type DKGProcessor interface { // do so, the protocol can be broken. // log describes the misbehavior. FlagMisbehavior(participant int, log string) -}*/ +} diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 50923ee9087..fa4729c84e2 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -6,51 +6,37 @@ #define N_max 250 #define N_bits_max 8 // log(250) #define T_max ((N_max-1)/2) - +/* // computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ - bn_t image; - bn_new(image); - Fr_polynomialImage(image, y, a, a_size, x); + Fr image; + Fr_polynomialImage(&image, y, a, a_size, x); // exports the result - const int out_size = Fr_BYTES; - bn_write_bin(out, out_size, image); - bn_free(image); -} + Fr_write_bytes(out, &image); +}*/ -// computes P(x) = a_0 + a_1*x + .. 
+ a_n x^n (mod r) -// r being the order of G1 -// writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer -void Fr_polynomialImage(bn_t image, ep2_t y, const bn_st *a, const int a_size, const byte x){ - bn_t r; - bn_new(r); - g2_get_ord(r); - - // temp variables - bn_t acc; - bn_new(acc); - bn_new_size(acc, BITS_TO_DIGITS(Fr_BITS+8+1)); - bn_set_dig(acc, 0); +// computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. +// a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. +// The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL +void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const byte x){ + Fr_set_zero(image); + // convert `x` to Montgomery form + Fr xR; + Fr_set_limb(&xR, (limb_t)x); + Fr_to_montg(&xR, &xR); - for (int i=a_size-1; i >= 0; i--) { - bn_mul_dig(acc, acc, x); - // Use basic reduction as it's an 9-bits reduction - // in the worst case (|acc|<|r|+9 ) - bn_mod_basic(acc, acc, r); - bn_add(acc, acc, &a[i]); + for (int i = a_size-1; i >= 0; i--) { + Fr_mul_montg(image, image, &xR); + Fr_add(image, image, &a[i]); // image is in normal form } - // export the result - bn_mod_basic(image, acc, r); - // compute y = P(x).g2 - if (y) g2_mul_gen(y, acc); - - bn_free(acc) - bn_free(r); + if (y) { + bn_st* tmp = Fr_blst_to_relic(image); + g2_mul_gen(y, tmp); + } } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 8a92cd5dff3..2e7688b11fa 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -203,13 +203,14 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) return nil, nil, nil, dkgFailureErrorf("group private key is identity and is therefore invalid") } return x, Y, y, nil -} +}*/ const ( complaintSize = 1 complaintAnswerSize = 1 + PrKeyLenBLSBLS12381 ) +/* // HandleBroadcastMsg processes a new broadcasted message received by the current participant. 
// orig is the message origin index // diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index f50b143961d..34c81053fa7 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -5,12 +5,8 @@ #include "bls12381_utils.h" -// the highest index of a DKG participant -#define MAX_IND 255 -#define MAX_IND_BITS 8 - void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x); -void Fr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); +void Fr_polynomialImage(Fr* out, ep2_t y, const Fr* a, const int a_size, const byte x); void G2_polynomialImages(ep2_st* y, const int len_y, const ep2_st* A, const int len_A); void ep2_vector_write_bin(byte* out, const ep2_st* A, const int len); int ep2_vector_read_bin(ep2_st* A, const byte* src, const int len); diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 104cb8ef56f..fc1de49d225 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -3,26 +3,26 @@ package crypto -/* import ( "fmt" mrand "math/rand" "sync" "testing" - "time" + _ "time" log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" + _ "github.com/stretchr/testify/require" ) var gt *testing.T +/* func TestDKG(t *testing.T) { t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) t.Run("FeldmanVSSQual", testFeldmanVSSQual) t.Run("JointFeldman", testJointFeldman) -} +}*/ // optimal threshold (t) to allow the largest number of malicious participants (m) // assuming the protocol requires: @@ -33,6 +33,7 @@ func optimalThreshold(size int) int { return (size - 1) / 2 } +/* // Testing the happy path of Feldman VSS by simulating a network of n participants func testFeldmanVSSSimple(t *testing.T) { log.SetLevel(log.ErrorLevel) @@ -43,7 +44,7 @@ func testFeldmanVSSSimple(t *testing.T) { dkgCommonTest(t, feldmanVSS, n, threshold, happyPath) }) } -} +}*/ type testCase int @@ -68,7 +69,7 @@ const ( invalidSharesComplainTrigger invalidComplaintAnswerBroadcast duplicatedSendAndBroadcast -) +) /* // Testing Feldman VSS with the qualification system by simulating a network of n participants func testFeldmanVSSQual(t *testing.T) { @@ -441,7 +442,7 @@ func timeoutPostProcess(processors []testDKGProcessor, t *testing.T, phase int) }(i) } } -} +}*/ // implements DKGProcessor interface type testDKGProcessor struct { @@ -767,7 +768,7 @@ func TestDKGErrorTypes(t *testing.T) { assert.False(t, IsDKGInvalidStateTransitionError(otherError)) assert.False(t, IsDKGInvalidStateTransitionError(nil)) }) -} +} /* func TestDKGTransitionErrors(t *testing.T) { n := 5 diff --git a/crypto/thresholdsign.go b/crypto/thresholdsign.go index ebb814dee5b..2dae7061b76 100644 --- a/crypto/thresholdsign.go +++ b/crypto/thresholdsign.go @@ -16,10 +16,10 @@ import ( // the input threshold value (t) should be set to t = floor((n-1)/2). const ( -// ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol -// ThresholdSignMinSize = MinimumThreshold + 1 -// ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol -// ThresholdSignMaxSize = DKGMaxSize + // ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol + ThresholdSignMinSize = MinimumThreshold + 1 + // ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol + ThresholdSignMaxSize = DKGMaxSize ) // ThresholdSignatureInspector is an inspector of the threshold signature protocol. 
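For reference, the share computation in Fr_polynomialImage above is a Horner evaluation; the following is a sketch of that identity, assuming (as defined earlier in the patch) that R is the Montgomery constant, x is the small participant index, xR = x*R, and Fr_mul_montg(a,b) returns a*b*R^(-1):

  P(x) = a_0 + a_1*x + .. + a_n*x^n
       = ((..(a_n*x + a_(n-1))*x + ..)*x + a_1)*x + a_0   (mod r)

  acc <- Fr_mul_montg(acc, xR) = acc*(x*R)*R^(-1) = acc*x
  acc <- acc + a_i

Because each Montgomery multiplication by xR cancels the extra R factor, only x needs to be converted to Montgomery form; the coefficients a_i and the accumulator stay in normal form throughout, which is why the loop needs no final conversion of the result.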
From 8b49d0ac6979a0321ce4db21db15050f44819301 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 27 Feb 2023 13:37:21 -0600 Subject: [PATCH 013/200] add Fermat inversion - Lagrange interpolation works with Euclidean inversion and multiple cleanups --- crypto/bls12381_hashtocurve.c | 12 +--- crypto/bls12381_utils.c | 102 +++++++++++++++++++++++++++---- crypto/bls12381_utils.h | 21 ++++--- crypto/bls_core.c | 2 - crypto/bls_thresholdsign_core.c | 54 ++++------------ crypto/bls_thresholdsign_test.go | 35 +++++++++-- crypto/blst_include.h | 6 -- crypto/blst_tools.c | 24 -------- 8 files changed, 147 insertions(+), 109 deletions(-) delete mode 100644 crypto/blst_tools.c diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c index 62053d7ed22..3e8217d42e5 100644 --- a/crypto/bls12381_hashtocurve.c +++ b/crypto/bls12381_hashtocurve.c @@ -10,7 +10,7 @@ extern prec_st* bls_prec; // These constants are taken from https://github.com/kwantam/bls12-381_hash // and converted to the Mongtomery domain. // Copyright 2019 Riad S. Wahby -const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_DIGITS] = { +const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS] = { {0x4d18b6f3af00131c, 0x19fa219793fee28c, 0x3f2885f1467f19ae, 0x23dcea34f2ffb304, 0xd15b58d2ffc00054, 0x0913be200a20bef4,}, {0x898985385cdbbd8b, 0x3c79e43cc7d966aa, 0x1597e193f4cd233a, @@ -37,7 +37,7 @@ const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_DIGITS] = { 0x464170142a1009eb, 0xb14f01aadb30be2f, 0x18ae6a856f40715d,}, }; -const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_DIGITS] = { +const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS] = { {0x2b567ff3e2837267, 0x1d4d9e57b958a767, 0xce028fea04bd7373, 0xcc31a30a0b6cd3df, 0x7d7b18a682692693, 0x0d300744d42a0310,}, {0x99c2555fa542493f, 0xfe7f53cc4874f878, 0x5df0608b8f97608a, @@ -335,12 +335,4 @@ void map_to_G1(ep_t h, const byte* data, const int len) { #elif hashToPoint==RELIC_SSWU ep_map_from_field(h, data, len); #endif - - /*Fr a, b; - Fr_set_limb(&a, 1); - Fr_print_("a", &a); - Fr_inv_montg_eucl(&b,&a); - Fr_print_("b", &b); - Fr_from_montg(&b, &b); - Fr_print_("b", &b); */ } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c8dfb808827..dc8b642de66 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -24,6 +24,16 @@ int get_Fr_BYTES() { } // Fr utilities +// Montgomery constant R related to the curve order r +const Fr BLS12_381_rR = (Fr){ /* (1<<256)%r */ + TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), + TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) +}; + +/*0x1824b159acc5056f +0x998c4fefecbc4ff5 +0x5884b7fa00034802 +0x00000001fffffffe*/ // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { @@ -51,7 +61,7 @@ void Fr_set_limb(Fr* a, const limb_t l){ *((limb_t*)a) = l; } -void Fr_copy(Fr* res, Fr* a) { +void Fr_copy(Fr* res, const Fr* a) { vec_copy((byte*)res, (byte*)a, Fr_BYTES); } @@ -72,19 +82,26 @@ void Fr_neg(Fr *res, const Fr *a) { cneg_mod_256((limb_t*)res, (limb_t*)a, 1, BLS12_381_r); } +// res = a*b*R^(-1) void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b) { mul_mont_sparse_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r, r0); } +// res = a^2 * R^(-1) +void Fr_squ_montg(Fr *res, const Fr *a) { + sqr_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); +} + +// res = a*R void Fr_to_montg(Fr *res, const Fr *a) { mul_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_rRR, BLS12_381_r, r0); } +// res = a*R^(-1) void Fr_from_montg(Fr *res, const Fr *a) { from_mont_256((limb_t*)res, 
(limb_t*)a, BLS12_381_r, r0); } -// result is in Montgomery form // res = a^(-1)*R void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // copied and modified from BLST code @@ -98,8 +115,49 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { redc_mont_256((limb_t*)res, temp, BLS12_381_r, r0); } -void Fr_inv_montg_expo(Fr *res, const Fr *a) { - // TODO: +// result is in Montgomery form if base is in montgomery form +// if base = b*R, res = b^expo * R +// In general, res = base^expo * R^(-expo+1) +// `expo` is encoded as a little-endian limb_t table of length `expo_len`. +void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { + // mask of the most significant bit + const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); + limb_t mask = msb_mask; + int index = 0; + + expo += expo_len; + // Treat most significant zero limbs + while((index < expo_len) && (*(--expo) == 0)) { + index++; + } + // Treat the most significant zero bits + while((*expo & mask) == 0) { + mask >>= 1; + } + // Treat the first `1` bit + Fr_copy(res, base); + mask >>= 1; + // Scan all limbs of the exponent + for ( ; index < expo_len; expo--) { + // Scan all bits + for ( ; mask != 0 ; mask >>= 1 ) { + // square + Fr_squ_montg(res, res); + // multiply + if (*expo & mask) { + Fr_mul_montg(res, res ,base); + } + } + mask = msb_mask; + index++; + } +} + +void Fr_inv_exp_montg(Fr *res, const Fr *a) { + Fr r_2; + Fr_copy(&r_2, (Fr*)BLS12_381_r); + r_2.limbs[0] -= 2; + Fr_exp_montg(res, a, (limb_t*)&r_2, 4); } // computes the sum of the array elements and writes the sum in jointx @@ -110,6 +168,24 @@ void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { } } +// internal type of BLST `pow256` uses bytes little endian. +// input is bytes big endian as used by Flow crypto lib external scalars. 
+static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) +{ + unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; + if ((uptr_t)ret == (uptr_t)a) { // swap in place + for (int i=0; ibeta); - bn_read_raw(&bls_prec->beta, beta_data, Fp_DIGITS); + bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); bn_new(&bls_prec->z2_1_by3); bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); #endif @@ -329,8 +405,8 @@ void bytes_print_(char* s, byte* data, int len) { void Fr_print_(char* s, Fr* a) { printf("[%s]:\n", s); - limb_t* p = (limb_t*)(a) + Fr_DIGITS; - for (int i=0; i>3) -#define BITS_TO_DIGITS(x) ((x+63)>>6) -#define BYTES_TO_DIGITS(x) ((x+7)>>3) -#define DIGITS_TO_BYTES(x) ((x)<<3) +#define BITS_TO_LIMBS(x) ((x+63)>>6) +#define BYTES_TO_LIMBS(x) ((x+7)>>3) +#define LIMBS_TO_BYTES(x) ((x)<<3) #define MIN(a,b) ((a)>(b)?(b):(a)) // Fields and Group serialization lengths #define SEC_BITS 128 #define Fp_BITS 381 #define Fp2_BYTES (2*Fp_BYTES) -#define Fp_DIGITS BITS_TO_DIGITS(Fp_BITS) -#define Fp_BYTES DIGITS_TO_BYTES(Fp_DIGITS) // BLST implements Fp as a limb array +#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) +#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array #define Fr_BITS 255 -#define Fr_DIGITS BITS_TO_DIGITS(Fr_BITS) -#define Fr_BYTES DIGITS_TO_BYTES(Fr_DIGITS) // BLST implements Fr as a limb array +#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) +#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array #define G1_BYTES (2*Fp_BYTES) #define G2_BYTES (2*Fp2_BYTES) @@ -92,20 +92,23 @@ int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); void map_to_G1(ep_t, const byte*, const int); // Fr utilities +extern const Fr BLS12_381_rR; bool_t Fr_is_zero(const Fr* a); bool_t Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); -void Fr_copy(Fr*, Fr*); +void Fr_copy(Fr*, const Fr*); void Fr_set_zero(Fr*); void Fr_add(Fr *res, const Fr *a, const Fr *b); void Fr_sub(Fr *res, const Fr *a, const Fr *b); void Fr_neg(Fr *res, const Fr *a); void Fr_sum_vector(Fr*, const Fr x[], const int); void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); +void Fr_squ_montg(Fr *res, const Fr *a); void Fr_to_montg(Fr *res, const Fr *a); void Fr_from_montg(Fr *res, const Fr *a); +void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); -void Fr_inv_montg_expo(Fr *res, const Fr *a); +void Fr_inv_exp_montg(Fr *res, const Fr *a); BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); void Fr_write_bytes(uint8_t *bin, const Fr* a); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index ecec6ae346c..a1d47c73f17 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -108,14 +108,12 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i fp12_t pair; fp12_new(&pair); - if (core_get()->code != RLC_OK) printf("EUUUUUUUU\n"); // double pairing with Optimal Ate pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); // compare the result to 1 int res = fp12_cmp_dig(pair, 1); - #elif SINGLE_PAIRING fp12_t pair1, pair2; fp12_new(&pair1); fp12_new(&pair2); diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 68f5005ace4..e6f94716d9b 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -16,23 +16,16 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind Fr 
denominator; // eventually would represent D*R^k // Initialize N and D to Montgomery constant R - // TODO: hardcode R and add Fr_copy function - Fr_copy(&numerator, (Fr*)BLS12_381_rRR); - Fr_copy(&denominator, (Fr*)BLS12_381_rRR); - Fr_from_montg(&numerator, &numerator); - Fr_from_montg(&denominator, &denominator); + // TODO: hardcode R + Fr_copy(&numerator, &BLS12_381_rR); + Fr_copy(&denominator, &BLS12_381_rR); - // sign of D: 1 for positive and 0 for negative - int sign = 1; + // sign of D: 0 for positive and 1 for negative + int sign = 0; // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. #define MAX_IND_LOOPS 64/MAX_IND_BITS - - // choose inversion algorithm used for denominator - #define FERMAT_INVERSION 0 - #define EUCLIDEAN_INVERSION (FERMAT_INVERSION^1) - const int loops = MAX_IND_LOOPS; int k,j = 0; Fr tmp; @@ -50,47 +43,26 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind } limb_numerator *= indices[j]; } + // numerator and denominator are both computed in Montgomery form. // update numerator Fr_set_limb(&tmp, limb_numerator); // L_N - #if EUCLIDEAN_INVERSION == 1 - // numerator and denominator are both computed in Montgomery form. - Fr_to_montg(&tmp, &tmp); // L_N*R - #endif + Fr_to_montg(&tmp, &tmp); // L_N*R Fr_mul_montg(&numerator, &numerator, &tmp); // N*R // update denominator Fr_set_limb(&tmp, limb_denominator); // L_D - #if EUCLIDEAN_INVERSION == 1 - // keep numertaor and denominator are both computed in Montgomery form. - Fr_to_montg(&tmp, &tmp); // L_D*R - #endif + Fr_to_montg(&tmp, &tmp); // L_D*R Fr_mul_montg(&denominator, &denominator, &tmp); // D*R - //printf("%d--%lld--%lld\n", sign, limb_numerator, limb_denominator); } - if (!sign) { + if (sign) { Fr_neg(&denominator, &denominator); } - #if EUCLIDEAN_INVERSION == 1 - // at this point, denominator = D*R , numertaor = N*R - // inversion - Fr_inv_montg_eucl(&denominator, &denominator); // (DR)^(-1)*R = D^(-1) - Fr_mul_montg(res, &numerator, &denominator); // N*D^(-1) - #endif - - //printf("%d:LI(%d):\n", i, indices[i]); - //Fr_print_("res", res); - - #if FERMAT_INVERSION == 1 - // at this point, denominator = D*R^c , numertaor = N*R^c - // (c is the nummber of mult_mont, but the exact value isn't relevant) - // inversion inv(xR) = x^(-1)R - Fr_inv_montg_expo(&denominator, &denominator); // inv(D*R^c) = inv(D*R^(c-1)*R) = D^(-1)*R^(1-c)*R - Fr_mul_montg(&numerator, &numerator, &denominator); //N*D^(-1)*R - Fr_from_montg(res, &numerator); //N*D^(-1) - #endif + // at this point, denominator = D*R , numertaor = N*R + // inversion inv(x) = x^(-1)R + Fr_inv_montg_eucl(&denominator, &denominator); // (DR)^(-1)*R = D^(-1) + Fr_mul_montg(res, &numerator, &denominator); // N*D^(-1) } - // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. 
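For reference, a sketch of the standard interpolation identity implemented above, assuming indices(0)..indices(t) are the signer indices passed from the Go layer (shifted by +1 so that no index is zero) and shares(i) are their G1 signature shares:

  L_i(0) = ( prod_{j!=i} indices(j) ) / ( prod_{j!=i} (indices(j) - indices(i)) )   (mod r)
  P(0)   = sum_{i=0..t} L_i(0) * shares(i)     (sum computed in G1)

The numerator and denominator products correspond to the N and D values accumulated limb by limb in Fr_lagrangeCoefficientAtZero (with the sign of D tracked separately), and D is inverted once per coefficient with Fr_inv_montg_eucl before the final N*D^(-1) multiplication.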
diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 3d05177369c..04603a70a55 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -546,7 +546,7 @@ func testCentralizedStatelessAPI(t *testing.T) { n := 10 for threshold := MinimumThreshold; threshold < n; threshold++ { // generate threshold keys - r := int64(1677308758239641000) //time.Now().UnixNano() + r := time.Now().UnixNano() mrand.Seed(r) t.Log(r) seed := make([]byte, SeedMinLenDKG) @@ -607,7 +607,7 @@ func testCentralizedStatelessAPI(t *testing.T) { } } -/*func BenchmarkSimpleKeyGen(b *testing.B) { +func BenchmarkSimpleKeyGen(b *testing.B) { n := 60 seed := make([]byte, SeedMinLenDKG) _, _ = rand.Read(seed) @@ -616,6 +616,33 @@ func testCentralizedStatelessAPI(t *testing.T) { _, _, _, _ = BLSThresholdKeyGen(n, optimalThreshold(n), seed) } b.StopTimer() -}*/ +} -// TODO: add benchmark for signature reconstruction +func BenchmarkSignatureReconstruction(b *testing.B) { + n := 60 + seed := make([]byte, SeedMinLenDKG) + _, _ = rand.Read(seed) + threshold := 40 + // generate threshold keys + skShares, _, _, err := BLSThresholdKeyGen(n, threshold, seed) + require.NoError(b, err) + // signature hasher + kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) + // generate signature shares + signShares := make([]Signature, 0, threshold+1) + signers := make([]int, 0, threshold+1) + // create (t+1) signatures of the first randomly chosen signers + for i := 0; i < threshold+1; i++ { + signers = append(signers, i) + share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) + require.NoError(b, err) + signShares = append(signShares, share) + } + // reconstruct + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers) + require.NoError(b, err) + } + b.StopTimer() +} diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 0fd710d1579..d7a7cdf4367 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -24,10 +24,4 @@ typedef POINTonE1 G1; // Subroup G1 in E2 typedef POINTonE2 G2; - -// extra functions and tools that are needed by the Flow crypto library -// and that are not exported in the desired form by BLST - -void pow256_from_be_bytes(pow256 ret, const unsigned char a[32]); - #endif \ No newline at end of file diff --git a/crypto/blst_tools.c b/crypto/blst_tools.c deleted file mode 100644 index 81fba31ac9e..00000000000 --- a/crypto/blst_tools.c +++ /dev/null @@ -1,24 +0,0 @@ -// +build relic - -// extra tools to use BLST low level that are needed by the Flow crypto library - -#include "blst_include.h" -#include "bls12381_utils.h" - -// internal type of BLST `pow256` uses bytes little endian. -// input is bytes big endian as used by Flow crypto lib external scalars. 
-void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) -{ - unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; - if ((uptr_t)ret == (uptr_t)a) { // swap in place - for (int i=0; i Date: Mon, 27 Feb 2023 14:37:28 -0600 Subject: [PATCH 014/200] DKG feldmanVSS with Fr type --- crypto/bls.go | 18 ++++-------------- crypto/bls12381_utils.go | 24 +++++++++++++++++++++++- crypto/dkg_core.c | 28 +++++++++++----------------- crypto/dkg_feldmanvss.go | 18 +++++++----------- crypto/dkg_feldmanvssq.go | 24 +++++++++--------------- crypto/dkg_include.h | 4 ++-- crypto/dkg_test.go | 35 +++++++++++++++++------------------ 7 files changed, 73 insertions(+), 78 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 48996e0ae9d..447ba6f532e 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -325,22 +325,12 @@ func BLSInvalidSignature() Signature { func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { sk := newPrKeyBLSBLS12381(nil) - read := C.Fr_star_read_bytes( - (*C.Fr)(&sk.scalar), - (*C.uchar)(&privateKeyBytes[0]), - (C.int)(prKeyLengthBLSBLS12381)) + err := readScalarFrStar(&sk.scalar, privateKeyBytes) - switch int(read) { - case blst_valid: - return sk, nil - case blst_bad_encoding: - return nil, invalidInputsErrorf("input length must be %d, got %d", - prKeyLengthBLSBLS12381, len(privateKeyBytes)) - case blst_bad_scalar: - return nil, invalidInputsErrorf("the private key is not in the correct range for the BLS12-381 curve") - default: - return nil, invalidInputsErrorf("reading the private key failed") + if err != nil { + return nil, fmt.Errorf("failed to read the private key: %w", err) } + return sk, nil } // decodePublicKey decodes a slice of bytes into a public key. diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index d569bf0cc38..cbdc718e364 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -159,7 +159,7 @@ func mapToZr(x *scalar, src []byte) bool { return false } -// writeScalar writes a G2 point in a slice of bytes +// writeScalar writes a scalar in a slice of bytes func writeScalar(dest []byte, x *scalar) { C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) } @@ -184,6 +184,28 @@ func writePointG1(dest []byte, a *pointG1) { ) } +// read an Fr* element from a byte slice +// and stores it into a `scalar` type element. +func readScalarFrStar(a *scalar, src []byte) error { + read := C.Fr_star_read_bytes( + (*C.Fr)(a), + (*C.uchar)(&src[0]), + (C.int)(len(src))) + + switch int(read) { + case blst_valid: + return nil + case blst_bad_encoding: + return invalidInputsErrorf("input length must be %d, got %d", + frBytesLen, len(src)) + case blst_bad_scalar: + return invalidInputsErrorf("scalar is not in the correct range w.r.t the BLS12-381 curve") + default: + return invalidInputsErrorf("reading the scalar failed") + } + +} + // readPointG2 reads a G2 point from a slice of bytes // The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index fa4729c84e2..34d6addbffb 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -6,17 +6,17 @@ #define N_max 250 #define N_bits_max 8 // log(250) #define T_max ((N_max-1)/2) -/* + // computes P(x) = a_0 + a_1*x + .. 
+ a_n x^n (mod r) // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Fr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ +void Fr_polynomialImage_export(byte* out, ep2_t y, const Fr* a, const int a_size, const byte x){ Fr image; Fr_polynomialImage(&image, y, a, a_size, x); // exports the result Fr_write_bytes(out, &image); -}*/ +} // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. @@ -42,8 +42,7 @@ void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y // r is the order of G2 -static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, - const byte x, const bn_t r){ +static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, const byte x){ bn_t bn_x; bn_new(bn_x); @@ -54,24 +53,18 @@ static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, ep2_add_projc(y, y, (ep2_st*)&A[i]); } - ep2_norm(y, y); // not necessary but left here to optimize the + ep2_norm(y, y); // not necessary but called to optimize the // multiple pairing computations with the same public key bn_free(bn_x); } -// compute the participants public keys from the verification vector -// y[i] = Q(i+1) for all participants i, with: -// Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 +// computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) +// where Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2[X] void G2_polynomialImages(ep2_st *y, const int len_y, const ep2_st* A, const int len_A) { - // order r - bn_t r; - bn_new(r); - g2_get_ord(r); for (byte i=0; i threshold || (n-r1) <= threshold)) || - (dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer + if false { //(dkg == jointFeldman && (r1 > threshold || (n-r1) <= threshold)) || + //(dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer t.Logf("dkg failed, there are %d disqualified participants\n", r1) // DKG failed, check for final errors for i := r1; i < n; i++ { @@ -442,7 +441,7 @@ func timeoutPostProcess(processors []testDKGProcessor, t *testing.T, phase int) }(i) } } -}*/ +} // implements DKGProcessor interface type testDKGProcessor struct { From 2bbea260fb2d16a225da3549263353755a401e46 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 27 Feb 2023 19:36:52 -0600 Subject: [PATCH 015/200] enable all DKG protocols to work with new Fr type --- crypto/bls_thresholdsign_test.go | 8 ++++---- crypto/dkg_feldmanvssq.go | 10 ++++------ crypto/dkg_jointfeldman.go | 5 ++--- crypto/dkg_test.go | 22 +++++++++++----------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 04603a70a55..5473b454827 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -21,8 +21,8 @@ func TestBLSThresholdSignature(t *testing.T) { t.Run("centralized_stateless_keygen", testCentralizedStatelessAPI) // stateful API t.Run("centralized_stateful_keygen", testCentralizedStatefulAPI) - //t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) - //t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case + t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) + 
t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case } const thresholdSignatureTag = "random tag" @@ -314,7 +314,7 @@ func testCentralizedStatefulAPI(t *testing.T) { // Distributed Threshold Signature stateful api test // keys are generated using simple Feldman VSS -/*func testDistributedStatefulAPI_FeldmanVSS(t *testing.T) { +func testDistributedStatefulAPI_FeldmanVSS(t *testing.T) { log.SetLevel(log.ErrorLevel) log.Info("DKG starts") gt = t @@ -439,7 +439,7 @@ func testDistributedStatefulAPI_JointFeldman(t *testing.T) { // synchronize the main thread to end TS sync.Wait() } -}*/ +} // This is a testing function // It simulates processing incoming messages by a participant during DKG diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 53a1dc4278b..cc0b94962df 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -3,7 +3,6 @@ package crypto -/* // #cgo CFLAGS: // #include "dkg_include.h" import "C" @@ -203,14 +202,13 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) return nil, nil, nil, dkgFailureErrorf("group private key is identity and is therefore invalid") } return x, Y, y, nil -}*/ +} const ( complaintSize = 1 complaintAnswerSize = 1 + PrKeyLenBLSBLS12381 ) -/* // HandleBroadcastMsg processes a new broadcasted message received by the current participant. // orig is the message origin index // @@ -624,7 +622,7 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) } // read the complainer private share - err := readScalarFrStar(&s.complaints[complainer].answer, data[1]) + err := readScalarFrStar(&s.complaints[complainer].answer, data[1:]) if err != nil { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), @@ -645,7 +643,7 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) // flag check is a sanity check if c.received { // read the complainer private share - err := readScalarFrStar(&c.answer, data[1]) + err := readScalarFrStar(&c.answer, data[1:]) if err != nil { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), @@ -666,4 +664,4 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) s.x = c.answer } } -}*/ +} diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index be8b2c9f70f..bef857fba37 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -3,7 +3,6 @@ package crypto -/* // #cgo CFLAGS: // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" @@ -203,7 +202,7 @@ func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { jointx, jointPublicKey, jointy := s.sumUpQualifiedKeys(s.size - disqualifiedTotal) // private key of the current participant - x := newPrKeyBLSBLS12381(&jointx) + x := newPrKeyBLSBLS12381(jointx) // Group public key Y := newPubKeyBLSBLS12381(jointPublicKey) @@ -338,4 +337,4 @@ func (s *JointFeldmanState) getQualifiedKeys(qualified int) ([]scalar, []pointG2 } } return qualifiedx, qualifiedPubKey, qualifiedy -}*/ +} diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index d3347df0c93..0d32a3fd1ec 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -19,7 +19,7 @@ var gt *testing.T func TestDKG(t *testing.T) { t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) - //t.Run("FeldmanVSSQual", testFeldmanVSSQual) + t.Run("FeldmanVSSQual", testFeldmanVSSQual) //t.Run("JointFeldman", testJointFeldman) } @@ -67,7 +67,7 @@ const ( 
invalidSharesComplainTrigger invalidComplaintAnswerBroadcast duplicatedSendAndBroadcast -) /* +) // Testing Feldman VSS with the qualification system by simulating a network of n participants func testFeldmanVSSQual(t *testing.T) { @@ -96,6 +96,7 @@ func testFeldmanVSSQual(t *testing.T) { // are only tested within joint feldman. } +/* // Testing JointFeldman by simulating a network of n participants func testJointFeldman(t *testing.T) { log.SetLevel(log.ErrorLevel) @@ -137,8 +138,8 @@ func testJointFeldman(t *testing.T) { // Supported Key Generation protocols const ( feldmanVSS = iota - /*feldmanVSSQual - jointFeldman*/ + feldmanVSSQual + jointFeldman ) func newDKG(dkg int, size int, threshold int, myIndex int, @@ -146,10 +147,10 @@ func newDKG(dkg int, size int, threshold int, myIndex int, switch dkg { case feldmanVSS: return NewFeldmanVSS(size, threshold, myIndex, processor, dealerIndex) - /*case feldmanVSSQual: + case feldmanVSSQual: return NewFeldmanVSSQual(size, threshold, myIndex, processor, dealerIndex) case jointFeldman: - return NewJointFeldman(size, threshold, myIndex, processor)*/ + return NewJointFeldman(size, threshold, myIndex, processor) default: return nil, fmt.Errorf("non supported protocol") } @@ -171,12 +172,11 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { // number of dealers in the protocol var dealers int - /*if dkg == jointFeldman { + if dkg == jointFeldman { dealers = n } else { dealers = 1 - }*/ - dealers = 1 + } // create n processors for all participants processors := make([]testDKGProcessor, 0, n) @@ -348,8 +348,8 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { assert.Equal(t, expected, processors[i].disqualified) } // check if DKG is successful - if false { //(dkg == jointFeldman && (r1 > threshold || (n-r1) <= threshold)) || - //(dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer + if (dkg == jointFeldman && (r1 > threshold || (n-r1) <= threshold)) || + (dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer t.Logf("dkg failed, there are %d disqualified participants\n", r1) // DKG failed, check for final errors for i := r1; i < n; i++ { From b113d3362152a2eaf1e97dda025835c32967e0f6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 13:53:22 -0600 Subject: [PATCH 016/200] uncomment tests --- crypto/bls_multisig.go | 20 +++----------------- crypto/bls_test.go | 16 +++++++--------- crypto/blst_include.h | 2 +- crypto/dkg_feldmanvss.go | 8 ++++---- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_test.go | 5 ++--- 6 files changed, 18 insertions(+), 35 deletions(-) diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 297e61267d9..b4fa5918ef7 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -5,11 +5,13 @@ package crypto import ( "errors" + "fmt" _ "errors" _ "fmt" + "github.com/onflow/flow-go/crypto/hash" _ "github.com/onflow/flow-go/crypto/hash" ) @@ -41,7 +43,6 @@ import "C" // used for signatures. var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) -/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. 
// // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -97,8 +98,6 @@ func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { // - (aggregated_signature, nil) otherwise func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - - // check for empty list if len(sigs) == 0 { return nil, blsAggregateEmptyListError @@ -144,8 +143,6 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { // - (aggregated_key, nil) otherwise func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - - // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError @@ -181,8 +178,6 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { // - (aggregated_key, nil) otherwise func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - - // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError @@ -203,7 +198,7 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil -}*/ +} // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). @@ -217,8 +212,6 @@ func IdentityBLSPublicKey() PublicKey { return &identity } -/* - // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -235,8 +228,6 @@ func IdentityBLSPublicKey() PublicKey { // - (remaining_key, nil) otherwise func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - - aggPKBLS, ok := aggKey.(*pubKeyBLSBLS12381) if !ok { return nil, notBLSKeyError @@ -335,8 +326,6 @@ func VerifyBLSSignatureManyMessages( pks []PublicKey, s Signature, messages [][]byte, kmac []hash.Hasher, ) (bool, error) { - - // check signature length if len(s) != signatureLengthBLSBLS12381 { return false, nil @@ -484,8 +473,6 @@ func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { - - // empty list check if len(pks) == 0 { return []bool{}, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) @@ -549,7 +536,6 @@ func BatchVerifyBLSSignaturesOneMessage( return verifBool, nil } -*/ // blsAggregateEmptyListError is returned when a list of BLS objects (e.g. signatures or keys) // is empty or nil and thereby represents an invalid input. 
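The bls_multisig.go hunks above re-enable the aggregation API. The hedged usage sketch below shows how those calls compose in a test; it assumes PrivateKey exposes PublicKey() and Sign(message, hasher) with the signatures used elsewhere in the package's tests, assumes a crypto/rand import for the seed, and elides all error handling. It is an illustration, not part of the patch.

// n distinct keys sign the same message; the aggregate signature verifies
// against the list of public keys.
message := []byte("message to sign")
kmac := NewExpandMsgXOFKMAC128("test tag")
n := 5
sks := make([]PrivateKey, n)
pks := make([]PublicKey, n)
sigs := make([]Signature, n)
for i := 0; i < n; i++ {
	seed := make([]byte, KeyGenSeedMinLen)
	_, _ = rand.Read(seed)
	sks[i], _ = GeneratePrivateKey(BLSBLS12381, seed)
	pks[i] = sks[i].PublicKey() // assumed accessor, as used in the package tests
	sigs[i], _ = sks[i].Sign(message, kmac)
}
aggSig, _ := AggregateBLSSignatures(sigs) // point addition in G1
valid, _ := VerifyBLSSignatureOneMessage(pks, aggSig, message, kmac)
// valid is expected to be true; the same aggSig should also equal the signature
// of `message` under AggregateBLSPrivateKeys(sks), which is the property the
// same-message aggregation test later in this series checks.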
diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 579700a183e..8aec95a8b03 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -8,7 +8,9 @@ import ( "encoding/hex" "fmt" _ "math/rand" + mrand "math/rand" "testing" + "time" _ "time" "github.com/stretchr/testify/assert" @@ -130,7 +132,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) }) - /*t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { + t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher sigKmac := NewExpandMsgXOFKMAC128("") @@ -139,7 +141,7 @@ func TestBLSBLS12381Hasher(t *testing.T) { // PoP hasher h2 := popKMAC.ComputeHash(data) assert.NotEqual(t, h1, h2) - })*/ + }) } @@ -215,7 +217,7 @@ func TestBLSUtils(t *testing.T) { } // BLS Proof of Possession test -/*func TestBLSPOP(t *testing.T) { +func TestBLSPOP(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -267,8 +269,6 @@ func TestBLSUtils(t *testing.T) { }) } - - // BLS multi-signature // signature aggregation sanity check // @@ -935,7 +935,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { assert.False(t, valid, "verification should fail with nil hasher") inputPks[0] = tmpPK }) -}*/ +} // TestBLSErrorTypes verifies working of error-type-detecting functions // such as `IsInvalidInputsError`. @@ -963,7 +963,6 @@ func TestBLSErrorTypes(t *testing.T) { }) } -/* // VerifyBLSSignatureManyMessages bench // Bench the slowest case where all messages and public keys are distinct. // (2*n) pairings without aggrgetion Vs (n+1) pairings with aggregation. @@ -1059,7 +1058,6 @@ func BenchmarkAggregate(b *testing.B) { }) } - func TestBLSIdentity(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) @@ -1112,4 +1110,4 @@ func TestBLSIdentity(t *testing.T) { assert.NoError(t, err) assert.False(t, valid) }) -}*/ +} diff --git a/crypto/blst_include.h b/crypto/blst_include.h index d7a7cdf4367..77c06a9e5e5 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -24,4 +24,4 @@ typedef POINTonE1 G1; // Subroup G1 in E2 typedef POINTonE2 G2; -#endif \ No newline at end of file +#endif diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index e80dc8d71e6..d27f68ee45c 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -285,7 +285,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { // the dealer's own share if i-1 == s.myIndex { xdata := make([]byte, shareSize) - zrPolynomialImage(xdata, s.a, i, &s.y[i-1]) + frPolynomialImage(xdata, s.a, i, &s.y[i-1]) err := readScalarFrStar(&s.x, xdata) if err != nil { return fmt.Errorf("unexpected error when generating the dealer's own share: %w", err) @@ -295,7 +295,7 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { // the-other-participant shares data := make([]byte, shareSize+1) data[0] = byte(feldmanVSSShare) - zrPolynomialImage(data[1:], s.a, i, &s.y[i-1]) + frPolynomialImage(data[1:], s.a, i, &s.y[i-1]) s.processor.PrivateSend(int(i-1), data) } // broadcast the vector @@ -401,11 +401,11 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { } } -// zrPolynomialImage computes P(x) = a_0 + a_1*x + .. + a_n*x^n (mod r) in Z/Fr +// frPolynomialImage computes P(x) = a_0 + a_1*x + .. 
+ a_n*x^n (mod r) in Fr[X] // r being the order of G1 // P(x) is written in dest, while g2^P(x) is written in y // x being a small integer -func zrPolynomialImage(dest []byte, a []scalar, x index, y *pointG2) { +func frPolynomialImage(dest []byte, a []scalar, x index, y *pointG2) { C.Fr_polynomialImage_export((*C.uchar)(&dest[0]), (*C.ep2_st)(y), (*C.Fr)(&a[0]), (C.int)(len(a)), diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index cc0b94962df..ff9dad35879 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -495,7 +495,7 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) data := make([]byte, complaintAnswerSize+1) data[0] = byte(feldmanVSSComplaintAnswer) data[1] = byte(complainee) - zrPolynomialImage(data[2:], s.a, complainee+1, nil) + frPolynomialImage(data[2:], s.a, complainee+1, nil) s.complaints[complainee].answerReceived = true s.processor.Broadcast(data) } diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 0d32a3fd1ec..da0e05782a0 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -20,7 +20,7 @@ var gt *testing.T func TestDKG(t *testing.T) { t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) t.Run("FeldmanVSSQual", testFeldmanVSSQual) - //t.Run("JointFeldman", testJointFeldman) + t.Run("JointFeldman", testJointFeldman) } // optimal threshold (t) to allow the largest number of malicious participants (m) @@ -96,7 +96,6 @@ func testFeldmanVSSQual(t *testing.T) { // are only tested within joint feldman. } -/* // Testing JointFeldman by simulating a network of n participants func testJointFeldman(t *testing.T) { log.SetLevel(log.ErrorLevel) @@ -134,7 +133,7 @@ func testJointFeldman(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, duplicatedMessages) }) } -*/ + // Supported Key Generation protocols const ( feldmanVSS = iota From 7a2617c56cf34d46e4875976952d771960d5e74e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 14:44:20 -0600 Subject: [PATCH 017/200] renaming and linter errors --- crypto/bls12381_utils.go | 9 +++------ crypto/bls12381_utils_test.go | 2 +- crypto/bls_thresholdsign.go | 12 +++++++++--- crypto/dkg_feldmanvss.go | 12 +++++++++--- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index cbdc718e364..e2ee855e081 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -122,7 +122,7 @@ func (p *pointG2) isInfinity() bool { } // returns a random element of Fr in input pointer -func randZr(x *scalar) error { +func randFr(x *scalar) error { bytes := make([]byte, frBytesLen+securityBits/8) _, err := rand.Read(bytes) // checking one output is enough if err != nil { @@ -133,7 +133,7 @@ func randZr(x *scalar) error { } // writes a random element of Fr* in input pointer -func randZrStar(x *scalar) error { +func randFrStar(x *scalar) error { bytes := make([]byte, frBytesLen+securityBits/8) isZero := true for isZero { @@ -153,10 +153,7 @@ func mapToZr(x *scalar, src []byte) bool { isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - if isZero { - return true - } - return false + return bool(isZero) } // writeScalar writes a scalar in a slice of bytes diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 4662aa9567f..e7dba41a8eb 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -19,7 +19,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { _, _ = rand.Read(seed) _ = seedRelic(seed) var expo scalar - 
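The frPolynomialImage comment and the randFr/randFrStar wrappers above describe sampling a random polynomial over Fr and evaluating it at a small participant index. The self-contained math/big sketch below redoes the same arithmetic; the hex constant is the BLS12-381 group order r, nonZeroScalar stands in for randFrStar (the real code reduces a longer random byte string modulo r rather than calling rand.Int), and none of these names belong to the package.

package main

import (
	"crypto/rand"
	"fmt"
	"math/big"
)

// r is the BLS12-381 group order (the modulus called r throughout the diff).
var r, _ = new(big.Int).SetString(
	"73eda753299d7d483339d80809a1d80553bde402fffe5bfeffffffff00000001", 16)

// nonZeroScalar mirrors the intent of randFrStar: a uniform non-zero Fr element.
func nonZeroScalar() *big.Int {
	for {
		x, _ := rand.Int(rand.Reader, r)
		if x.Sign() != 0 {
			return x
		}
	}
}

// polynomialImage evaluates P(x) = a[0] + a[1]*x + ... + a[t]*x^t (mod r)
// with Horner's rule; x is the (small) participant index.
func polynomialImage(a []*big.Int, x int64) *big.Int {
	res := new(big.Int)
	for i := len(a) - 1; i >= 0; i-- {
		res.Mul(res, big.NewInt(x))
		res.Add(res, a[i])
		res.Mod(res, r)
	}
	return res
}

func main() {
	t := 3
	a := make([]*big.Int, t+1)
	for i := range a {
		// the DKG only forces a[0] and a[t] to be non-zero; sampling every
		// coefficient non-zero keeps this sketch short.
		a[i] = nonZeroScalar()
	}
	fmt.Println(polynomialImage(a, 5)) // share handed to the participant at index 5
}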
randZr(&expo) + _ = randFr(&expo) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index ef4630a7341..e6c21004193 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -564,12 +564,18 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, } // Generate a polynomial P in Fr[X] of degree t a := make([]scalar, threshold+1) - randZrStar(&a[0]) // non-identity key + if err := randFrStar(&a[0]); err != nil { // non-identity key + return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) + } if threshold > 0 { for i := 1; i < threshold; i++ { - randZr(&a[i]) + if err := randFr(&a[i]); err != nil { + return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) + } + } + if err := randFrStar(&a[threshold]); err != nil { // enforce the polynomial degree + return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) } - randZrStar(&a[threshold]) // enforce the polynomial degree } // compute the shares for i := index(1); int(i) <= size; i++ { diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index d27f68ee45c..5db62e8672c 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -268,15 +268,21 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { s.vA = make([]pointG2, s.threshold+1) s.y = make([]pointG2, s.size) // non-zero a[0] - group private key is not zero - randZrStar(&s.a[0]) + if err := randFrStar(&s.a[0]); err != nil { + return fmt.Errorf("generating the polynomial failed: %w", err) + } generatorScalarMultG2(&s.vA[0], &s.a[0]) if s.threshold > 0 { for i := 1; i < s.threshold; i++ { - randZr(&s.a[i]) + if err := randFr(&s.a[i]); err != nil { + return fmt.Errorf("generating the polynomial failed: %w", err) + } generatorScalarMultG2(&s.vA[i], &s.a[i]) } // non-zero a[t] to enforce the polynomial degree - randZrStar(&s.a[s.threshold]) + if err := randFrStar(&s.a[s.threshold]); err != nil { + return fmt.Errorf("generating the polynomial failed: %w", err) + } generatorScalarMultG2(&s.vA[s.threshold], &s.a[s.threshold]) } From d7f3d5d5a531a773eadc2ceffc06145b6244c0be Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 17:29:50 -0600 Subject: [PATCH 018/200] fix gcc compilation issue and remove blst.h --- crypto/bls12381_utils.c | 2 +- crypto/bls12381_utils.h | 2 +- crypto/bls_thresholdsign_core.c | 5 +- crypto/blst_include.h | 56 +++- crypto/blst_src/blst.h | 483 -------------------------------- crypto/blst_src/blst_aux.h | 102 ------- 6 files changed, 59 insertions(+), 591 deletions(-) delete mode 100644 crypto/blst_src/blst.h delete mode 100644 crypto/blst_src/blst_aux.h diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index dc8b642de66..45811478429 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -25,7 +25,7 @@ int get_Fr_BYTES() { // Fr utilities // Montgomery constant R related to the curve order r -const Fr BLS12_381_rR = (Fr){ /* (1<<256)%r */ +const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) }; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 4939e15135e..9a874f6e9d3 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -92,7 +92,7 @@ int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); void 
map_to_G1(ep_t, const byte*, const int); // Fr utilities -extern const Fr BLS12_381_rR; +extern const limb_t BLS12_381_rR[Fr_LIMBS]; bool_t Fr_is_zero(const Fr* a); bool_t Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index e6f94716d9b..75542763f6a 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -16,9 +16,8 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind Fr denominator; // eventually would represent D*R^k // Initialize N and D to Montgomery constant R - // TODO: hardcode R - Fr_copy(&numerator, &BLS12_381_rR); - Fr_copy(&denominator, &BLS12_381_rR); + Fr_copy(&numerator, (Fr*)BLS12_381_rR); + Fr_copy(&denominator, (Fr*)BLS12_381_rR); // sign of D: 0 for positive and 1 for negative int sign = 0; diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 77c06a9e5e5..7af94ea3b17 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,13 +6,67 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h -#include "blst.h" // TODO: should be deleted +//#include "blst.h" // TODO: should be deleted #include "point.h" #include "consts.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types +// Parts of this file have been copied from blst.h in the BLST repo +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers diff --git a/crypto/blst_src/blst.h b/crypto/blst_src/blst.h deleted file mode 100644 index 24213ded2c5..00000000000 --- a/crypto/blst_src/blst.h +++ /dev/null @@ -1,483 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
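For context on Fr_lagrangeCoefficientAtZero in the bls_thresholdsign_core.c hunk above: outside Montgomery representation, the quantity being built is the Lagrange basis value at x = 0, l_i(0) = prod_{j != i} x_j / (x_j - x_i) (mod r), where the x_k are the signers' 1-based indices; initializing the numerator and denominator to the constant R only keeps the intermediate values in Montgomery form. Below is a math/big sketch of the plain, non-Montgomery computation; the names are illustrative and r is passed in by the caller.

// lagrangeCoefficientAtZero returns l_i(0) = prod_{j != i} x_j / (x_j - x_i) mod r
// for the signer at position i of `indices`; indices are assumed distinct.
func lagrangeCoefficientAtZero(i int, indices []int64, r *big.Int) *big.Int {
	num := big.NewInt(1)
	den := big.NewInt(1)
	xi := big.NewInt(indices[i])
	for j, xjRaw := range indices {
		if j == i {
			continue
		}
		xj := big.NewInt(xjRaw)
		num.Mod(num.Mul(num, xj), r)
		d := new(big.Int).Sub(xj, xi)
		den.Mod(den.Mul(den, d), r) // Mod maps negative values into [0, r)
	}
	den.ModInverse(den, r) // r is prime and den is non-zero, so the inverse exists
	return num.Mod(num.Mul(num, den), r)
}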
- * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLST_H__ -#define __BLST_H__ - -#ifdef __SIZE_TYPE__ -typedef __SIZE_TYPE__ size_t; -#else -#include -#endif - -#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ - && defined(__UINT64_TYPE__) -typedef __UINT8_TYPE__ uint8_t; -typedef __UINT32_TYPE__ uint32_t; -typedef __UINT64_TYPE__ uint64_t; -#else -#include -#endif - -#ifdef __cplusplus -extern "C" { -#elif defined(__BLST_CGO__) -typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ -#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 -# define bool _Bool -#else -# define bool int -#endif - -#ifdef SWIG -# define DEFNULL =NULL -#elif defined __cplusplus -# define DEFNULL =0 -#else -# define DEFNULL -#endif - -typedef enum { - BLST_SUCCESS = 0, - BLST_BAD_ENCODING, - BLST_POINT_NOT_ON_CURVE, - BLST_POINT_NOT_IN_GROUP, - BLST_AGGR_TYPE_MISMATCH, - BLST_VERIFY_FAIL, - BLST_PK_IS_INFINITY, - BLST_BAD_SCALAR, -} BLST_ERROR; - -typedef uint8_t byte; -typedef uint64_t limb_t; - -typedef struct { byte b[256/8]; } blst_scalar; -typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; -typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; -/* 0 is "real" part, 1 is "imaginary" */ -typedef struct { blst_fp fp[2]; } blst_fp2; -typedef struct { blst_fp2 fp2[3]; } blst_fp6; -typedef struct { blst_fp6 fp6[2]; } blst_fp12; - -void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); -void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); -void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); -void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); -void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); -void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); -void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); -void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); -bool blst_scalar_fr_check(const blst_scalar *a); -bool blst_sk_check(const blst_scalar *a); -bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); -bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); -bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); - -#ifndef SWIG -/* - * BLS12-381-specifc Fr operations. 
- */ -void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); -void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); -void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); -void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_sqr(blst_fr *ret, const blst_fr *a); -void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); -void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); -void blst_fr_inverse(blst_fr *ret, const blst_fr *a); -#ifdef BLST_FR_PENTAROOT -void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); -void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); -#endif - -void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); -void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); -void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); -void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); - -/* - * BLS12-381-specifc Fp operations. - */ -void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); -void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); -void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); -void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_sqr(blst_fp *ret, const blst_fp *a); -void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); -void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); -void blst_fp_inverse(blst_fp *ret, const blst_fp *a); -bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); - -void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); -void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); -void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); -void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); -void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); -void blst_bendian_from_fp(byte ret[48], const blst_fp *a); -void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); -void blst_lendian_from_fp(byte ret[48], const blst_fp *a); - -/* - * BLS12-381-specifc Fp2 operations. - */ -void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); -void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); -void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); -bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); - -/* - * BLS12-381-specifc Fp12 operations. - */ -void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); -void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); -void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); -void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, - const blst_fp6 *xy00z0); -void blst_fp12_conjugate(blst_fp12 *a); -void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); -/* caveat lector! |n| has to be non-zero and not more than 3! 
*/ -void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); -bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); -bool blst_fp12_is_one(const blst_fp12 *a); -bool blst_fp12_in_group(const blst_fp12 *a); -const blst_fp12 *blst_fp12_one(); -#endif // SWIG - -/* - * BLS12-381-specifc point operations. - */ -typedef struct { blst_fp x, y, z; } blst_p1; -typedef struct { blst_fp x, y; } blst_p1_affine; - -void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); -void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); -void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, - const blst_p1_affine *b); -void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, - const blst_p1_affine *b); -void blst_p1_double(blst_p1 *out, const blst_p1 *a); -void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, - size_t nbits); -void blst_p1_cneg(blst_p1 *p, bool cbit); -void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); -void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); -bool blst_p1_on_curve(const blst_p1 *p); -bool blst_p1_in_g1(const blst_p1 *p); -bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); -bool blst_p1_is_inf(const blst_p1 *a); -const blst_p1 *blst_p1_generator(); - -bool blst_p1_affine_on_curve(const blst_p1_affine *p); -bool blst_p1_affine_in_g1(const blst_p1_affine *p); -bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); -bool blst_p1_affine_is_inf(const blst_p1_affine *a); -const blst_p1_affine *blst_p1_affine_generator(); - -typedef struct { blst_fp2 x, y, z; } blst_p2; -typedef struct { blst_fp2 x, y; } blst_p2_affine; - -void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); -void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); -void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, - const blst_p2_affine *b); -void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, - const blst_p2_affine *b); -void blst_p2_double(blst_p2 *out, const blst_p2 *a); -void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, - size_t nbits); -void blst_p2_cneg(blst_p2 *p, bool cbit); -void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); -void blst_p2_from_affine(blst_p2 *out, const blst_p2_affine *in); -bool blst_p2_on_curve(const blst_p2 *p); -bool blst_p2_in_g2(const blst_p2 *p); -bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); -bool blst_p2_is_inf(const blst_p2 *a); -const blst_p2 *blst_p2_generator(); - -bool blst_p2_affine_on_curve(const blst_p2_affine *p); -bool blst_p2_affine_in_g2(const blst_p2_affine *p); -bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); -bool blst_p2_affine_is_inf(const blst_p2_affine *a); -const blst_p2_affine *blst_p2_affine_generator(); - -/* - * Multi-scalar multiplications and other multi-point operations. 
- */ - -void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], - size_t npoints); -void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints); - -size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); -void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, - const blst_p1_affine *const points[], - size_t npoints); -size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); -void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], - size_t wbits, size_t npoints, - const byte *const scalars[], size_t nbits, - limb_t *scratch); - -size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); -void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch); -void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch, - size_t bit0, size_t window); - -void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], - size_t npoints); -void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints); - -size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); -void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, - const blst_p2_affine *const points[], - size_t npoints); -size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); -void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], - size_t wbits, size_t npoints, - const byte *const scalars[], size_t nbits, - limb_t *scratch); - -size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); -void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch); -void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch, - size_t bit0, size_t window); - -/* - * Hash-to-curve operations. - */ -#ifndef SWIG -void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); -void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); -#endif - -void blst_encode_to_g1(blst_p1 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); -void blst_hash_to_g1(blst_p1 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); - -void blst_encode_to_g2(blst_p2 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); -void blst_hash_to_g2(blst_p2 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); - -/* - * Zcash-compatible serialization/deserialization. 
- */ -void blst_p1_serialize(byte out[96], const blst_p1 *in); -void blst_p1_compress(byte out[48], const blst_p1 *in); -void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); -void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); -BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); -BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); - -void blst_p2_serialize(byte out[192], const blst_p2 *in); -void blst_p2_compress(byte out[96], const blst_p2 *in); -void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); -void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); -BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); -BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); - -/* - * Specification defines two variants, 'minimal-signature-size' and - * 'minimal-pubkey-size'. To unify appearance we choose to distinguish - * them by suffix referring to the public key type, more specifically - * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to - * 'minimal-signature-size'. It might appear a bit counterintuitive - * in sign call, but no matter how you twist it, something is bound to - * turn a little odd. - */ -/* - * Secret-key operations. - */ -void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); -void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, - const blst_scalar *SK); -void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); -void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, - const blst_scalar *SK); - -/* - * Pairing interface. - */ -#ifndef SWIG -void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, - const blst_p1_affine *P); -void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); -void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); -void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], - const blst_p1_affine *P); -bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); -#endif - -#ifdef __BLST_CGO__ -typedef limb_t blst_pairing; -#elif defined(__BLST_RUST_BINDGEN__) -typedef struct {} blst_pairing; -#else -typedef struct blst_opaque blst_pairing; -#endif - -size_t blst_pairing_sizeof(); -void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, - const byte *DST DEFNULL, size_t DST_len DEFNULL); -const byte *blst_pairing_get_dst(const blst_pairing *ctx); -void blst_pairing_commit(blst_pairing *ctx); -BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - const blst_p1_affine *signature, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - bool pk_grpchk, - const blst_p1_affine *signature, - bool sig_grpchk, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - const blst_p1_affine *sig, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - bool pk_grpchk, - const blst_p1_affine *sig, - bool sig_grpchk, - const byte *scalar, - 
size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - const blst_p2_affine *signature, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - bool pk_grpchk, - const blst_p2_affine *signature, - bool sig_grpchk, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - const blst_p2_affine *sig, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - bool pk_grpchk, - const blst_p2_affine *sig, - bool sig_grpchk, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); -bool blst_pairing_finalverify(const blst_pairing *ctx, - const blst_fp12 *gtsig DEFNULL); - - -/* - * Customarily applications aggregate signatures separately. - * In which case application would have to pass NULLs for |signature| - * to blst_pairing_aggregate calls and pass aggregated signature - * collected with these calls to blst_pairing_finalverify. Inputs are - * Zcash-compatible "straight-from-wire" byte vectors, compressed or - * not. - */ -BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, - const byte *zwire); -BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, - const byte *zwire); - -void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); -void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); - -/* - * "One-shot" CoreVerify entry points. - */ -BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, - const blst_p2_affine *signature, - bool hash_or_encode, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, - size_t DST_len DEFNULL, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, - const blst_p1_affine *signature, - bool hash_or_encode, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, - size_t DST_len DEFNULL, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); - -extern const blst_p1_affine BLS12_381_G1; -extern const blst_p1_affine BLS12_381_NEG_G1; -extern const blst_p2_affine BLS12_381_G2; -extern const blst_p2_affine BLS12_381_NEG_G2; - -#include "blst_aux.h" - -#ifdef __cplusplus -} -#endif -#endif diff --git a/crypto/blst_src/blst_aux.h b/crypto/blst_src/blst_aux.h deleted file mode 100644 index 6d444fc1729..00000000000 --- a/crypto/blst_src/blst_aux.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLST_AUX_H__ -#define __BLST_AUX_H__ -/* - * This file lists interfaces that might be promoted to blst.h or removed, - * depending on their proven/unproven worthiness. 
- */ - -void blst_fr_to(blst_fr *ret, const blst_fr *a); -void blst_fr_from(blst_fr *ret, const blst_fr *a); - -void blst_fp_to(blst_fp *ret, const blst_fp *a); -void blst_fp_from(blst_fp *ret, const blst_fp *a); - -bool blst_fp_is_square(const blst_fp *a); -bool blst_fp2_is_square(const blst_fp2 *a); - -void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); -void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); - -/* - * Below functions produce both point and deserialized outcome of - * SkToPk and Sign. However, deserialized outputs are pre-decorated - * with sign and infinity bits. This means that you have to bring the - * output into compliance prior returning to application. If you want - * compressed point value, then do [equivalent of] - * - * byte temp[96]; - * blst_sk_to_pk2_in_g1(temp, out_pk, SK); - * temp[0] |= 0x80; - * memcpy(out, temp, 48); - * - * Otherwise do - * - * blst_sk_to_pk2_in_g1(out, out_pk, SK); - * out[0] &= ~0x20; - * - * Either |out| or |out_| can be NULL. - */ -void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, - const blst_scalar *SK); -void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, - const blst_p2 *hash, const blst_scalar *SK); -void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, - const blst_scalar *SK); -void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, - const blst_p1 *hash, const blst_scalar *SK); - -typedef struct {} blst_uniq; - -size_t blst_uniq_sizeof(size_t n_nodes); -void blst_uniq_init(blst_uniq *tree); -bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); - -#ifdef expand_message_xmd -void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, - const unsigned char *aug, size_t aug_len, - const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len); -#else -void blst_expand_message_xmd(byte *out, size_t out_len, - const byte *msg, size_t msg_len, - const byte *DST, size_t DST_len); -#endif - -void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, - size_t nbits); -void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, - size_t nbits); - -void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, - const blst_p1_affine *p); -blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); -void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); - -void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *salt, size_t salt_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *salt, size_t salt_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_derive_master_eip2333(blst_scalar *out_SK, - const byte *IKM, size_t IKM_len); -void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, - uint32_t child_index); - -void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); -void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); -void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); - -size_t blst_p1_sizeof(); -size_t blst_p1_affine_sizeof(); -size_t blst_p2_sizeof(); -size_t blst_p2_affine_sizeof(); -size_t blst_fp12_sizeof(); -#endif From 4979760110e2f23e82fbd6437677b9e6a1c1c4fa Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 3 Mar 2023 17:44:09 -0600 Subject: 
[PATCH 019/200] fix double definition --- crypto/blst_include.h | 1 - 1 file changed, 1 deletion(-) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 7af94ea3b17..9052964f361 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -65,7 +65,6 @@ typedef enum { } BLST_ERROR; typedef uint8_t byte; -typedef uint64_t limb_t; // field elements F_r // where `r` is the order of G1/G2. From 90c412fd01d765ce006eeb350ad1e64400b8ab8e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 5 Mar 2023 20:54:50 -0600 Subject: [PATCH 020/200] enable tmate on ci temporarily --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9b977950c97..e360717d043 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -115,6 +115,8 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) if: github.actor != 'bors[bot]' uses: nick-fields/retry@v2 From d96d6fa6b9b7354a72e55eaf7e16e2cacc0a1931 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 5 Mar 2023 23:47:33 -0600 Subject: [PATCH 021/200] test improvement and temporary memory free --- crypto/bls12381_utils.c | 9 ++++----- crypto/bls_test.go | 22 +++++++++++----------- crypto/dkg_core.c | 2 ++ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 45811478429..10531c09602 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -30,11 +30,6 @@ const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) }; -/*0x1824b159acc5056f -0x998c4fefecbc4ff5 -0x5884b7fa00034802 -0x00000001fffffffe*/ - // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { bn_st* out = (bn_st*)malloc(sizeof(bn_st)); @@ -366,6 +361,7 @@ void ep_mult(ep_t res, const ep_t p, const Fr *expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 ep_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } // Exponentiation of generator g1 in G1 @@ -374,6 +370,7 @@ void ep_mult_gen_bench(ep_t res, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 ep_mul_gen(res, tmp_expo); + free(tmp_expo); } void ep_mult_generic_bench(ep_t res, const Fr* expo) { @@ -386,6 +383,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using window NAF of size 2 ep2_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } // Exponentiation of generator g2 in G2 @@ -393,6 +391,7 @@ void ep2_mult_gen(ep2_t res, const Fr* expo) { bn_st* tmp_expo = Fr_blst_to_relic(expo); // Using precomputed table of size 4 g2_mul_gen(res, tmp_expo); + free(tmp_expo); } // DEBUG printing functions diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 8aec95a8b03..a9672b8eeb7 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -7,11 +7,9 @@ import ( "crypto/rand" "encoding/hex" "fmt" - _ "math/rand" mrand "math/rand" "testing" "time" - _ "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -69,7 +67,7 @@ func BenchmarkBLSBLS12381Verify(b *testing.B) { // utility function to generate a random BLS private key func randomSK(t *testing.T, seed []byte) PrivateKey { - n, err := rand.Read(seed) + n, err := mrand.Read(seed) require.Equal(t, n, KeyGenSeedMinLen) require.NoError(t, err) sk, err := 
GeneratePrivateKey(BLSBLS12381, seed) @@ -270,23 +268,23 @@ func TestBLSPOP(t *testing.T) { } // BLS multi-signature -// signature aggregation sanity check +// signature aggregation with the same message sanity check // // Aggregate n signatures of the same message under different keys, and compare // it against the signature of the message under an aggregated private key. // Verify the aggregated signature using the multi-signature verification with // one message. -func TestBLSAggregateSignatures(t *testing.T) { +func TestBLSAggregateSignaturesSameMessage(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) // random message input := make([]byte, 100) - _, err := rand.Read(input) + _, err := mrand.Read(input) require.NoError(t, err) // hasher kmac := NewExpandMsgXOFKMAC128("test tag") // number of signatures to aggregate - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) sigsNum := mrand.Intn(100) + 1 sigs := make([]Signature, 0, sigsNum) sks := make([]PrivateKey, 0, sigsNum) @@ -330,19 +328,21 @@ func TestBLSAggregateSignatures(t *testing.T) { t.Run("one invalid signature", func(t *testing.T) { input[0] ^= 1 randomIndex := mrand.Intn(sigsNum) - sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) + sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // sign a different message input[0] ^= 1 aggSig, err = AggregateBLSSignatures(sigs) require.NoError(t, err) + // First check: check the signatures are not equal assert.NotEqual(t, aggSig, expectedSig, "signature %s shouldn't be %s private keys are %s, input is %x", aggSig, expectedSig, sks, input) + // Second check: multi-verification should fail valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) assert.False(t, valid, "verification of signature %s should fail, it shouldn't be %s private keys are %s, input is %x", aggSig, expectedSig, sks, input) - sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) + sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // rebuild the correct signature require.NoError(t, err) }) diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 34d6addbffb..9ca0e7a821e 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -36,6 +36,7 @@ void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const if (y) { bn_st* tmp = Fr_blst_to_relic(image); g2_mul_gen(y, tmp); + free(tmp); } } @@ -102,6 +103,7 @@ int verifyshare(const Fr* x, const ep2_t y) { ep2_new(res); bn_st* x_tmp = Fr_blst_to_relic(x); g2_mul_gen(res, x_tmp); + free(x_tmp); return (ep2_cmp(res, (ep2_st*)y) == RLC_EQ); } From 0593da596b5e67e61a2907af93fd34678f993ffb Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 6 Mar 2023 08:46:20 -0600 Subject: [PATCH 022/200] fix memory allocation bug in temp function --- crypto/bls12381_utils.c | 1 + crypto/bls_thresholdsign_test.go | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 10531c09602..c4f2d10b632 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -35,6 +35,7 @@ bn_st* Fr_blst_to_relic(const Fr* x) { bn_st* out = (bn_st*)malloc(sizeof(bn_st)); byte* data = (byte*)malloc(Fr_BYTES); be_bytes_from_limbs(data, (limb_t*)x, Fr_BYTES); + out->alloc = RLC_DV_DIGS; bn_read_bin(out, data, Fr_BYTES); free(data); return out; diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 5473b454827..0d7f7204a79 100644 --- 
a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -31,10 +31,12 @@ var thresholdSignatureMessage = []byte("random message") // centralized test of the stateful threshold signature using the threshold key generation. func testCentralizedStatefulAPI(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Log(r) n := 10 for threshold := MinimumThreshold; threshold < n; threshold++ { // generate threshold keys - mrand.Seed(time.Now().UnixNano()) seed := make([]byte, SeedMinLenDKG) _, err := mrand.Read(seed) require.NoError(t, err) From 0e8829e7c8194fd89b344318dcb9f0080b486b97 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 6 Mar 2023 09:22:06 -0600 Subject: [PATCH 023/200] Revert "enable tmate on ci temporarily" This reverts commit 90c412fd01d765ce006eeb350ad1e64400b8ab8e. --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e360717d043..9b977950c97 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -115,8 +115,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) if: github.actor != 'bors[bot]' uses: nick-fields/retry@v2 From 52cae3e0af4635ad17ed8edd4a86464db9ede606 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 8 Mar 2023 16:30:28 -0600 Subject: [PATCH 024/200] g1 and g2 exportable types to cgo --- crypto/bls12381_utils.c | 13 ++++++++----- crypto/bls12381_utils.go | 4 ++-- crypto/bls12381_utils.h | 2 +- crypto/blst_include.h | 40 ++++++++++++++++++++++++++++++++-------- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c4f2d10b632..f1a93173971 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -182,6 +182,10 @@ static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) } } +static void pow256_from_Fr(pow256 ret, const Fr* in) { + le_bytes_from_limbs(ret, (limb_t*)in, Fr_BYTES); +} + // reads a scalar in `a` and checks it is a valid Fr element (a < r). // input bytes are big endian. 
// returns: @@ -388,11 +392,10 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { } // Exponentiation of generator g2 in G2 -void ep2_mult_gen(ep2_t res, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using precomputed table of size 4 - g2_mul_gen(res, tmp_expo); - free(tmp_expo); +void G2_mult_gen(G2* res, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_sign(res, &BLS12_381_G2, tmp); } // DEBUG printing functions diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index e2ee855e081..6f093d57812 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -21,7 +21,7 @@ import ( // Go wrappers around BLST C types // Go wrappers around Relic C types type pointG1 C.ep_st -type pointG2 C.ep2_st +type pointG2 C.G2 type scalar C.Fr // BLS12-381 related lengths @@ -96,7 +96,7 @@ func genericScalarMultG1(res *pointG1, expo *scalar) { // Exponentiation of g2 in G2 func generatorScalarMultG2(res *pointG2, expo *scalar) { - C.ep2_mult_gen((*C.ep2_st)(res), (*C.Fr)(expo)) + C.G2_mult_gen((*C.G2)(res), (*C.Fr)(expo)) } // comparison in Fr where r is the group order of G1/G2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 9a874f6e9d3..e0c5fed472c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -131,7 +131,7 @@ void ep2_write_bin_compact(byte *, const ep2_t, const int); void ep_mult_gen_bench(ep_t, const Fr*); void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); -void ep2_mult_gen(ep2_t, const Fr*); +void G2_mult_gen(ep2_t, const Fr*); void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); void ep_sum_vector(ep_t, ep_st*, const int); diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 9052964f361..0ee8e99ddb2 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,9 +6,9 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h -//#include "blst.h" // TODO: should be deleted #include "point.h" #include "consts.h" +#include "bls12381_utils.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types @@ -35,6 +35,8 @@ typedef __UINT64_TYPE__ uint64_t; #include #endif +typedef uint8_t byte; + #ifdef __cplusplus extern "C" { #elif defined(__BLST_CGO__) @@ -64,17 +66,39 @@ typedef enum { BLST_BAD_SCALAR, } BLST_ERROR; -typedef uint8_t byte; - // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers // are represented as a little endian vector of limbs. -// `Fr` is equivalent to type vec256 (used internally by BLST for F_r elements). -typedef struct {limb_t limbs[4];} Fr; +// `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). +// `Fr` is defined as a struct to be exportable through cgo to the Go layer. +typedef struct {limb_t limbs[Fr_LIMBS];} Fr; + +// field elements F_p +// F_p elements are represented as big numbers reduced modulo `p`. Big numbers +// are represented as a little endian vector of limbs. +// `Fp` is equivalent to type `vec384` (used internally by BLST for F_p elements). +// `Fp` does not need to be exported to cgo. +typedef vec384 Fp; + // Subroup G1 in E1 -typedef POINTonE1 G1; -// Subroup G1 in E2 -typedef POINTonE2 G2; +// G1 points are represented in Jacobian coordinates (x,y,z), +// where x, y, x are elements of F_p (type `Fp`). 
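The comments above describe G1 and G2 points in Jacobian coordinates (X, Y, Z). For readers unfamiliar with that representation: the affine point is recovered as x = X/Z^2 and y = Y/Z^3 modulo the field prime, which is what lets additions and doublings avoid a field inversion per operation. A small math/big sketch of the coordinate map only, under the assumption that Z is non-zero; any odd prime works for p here and the names are illustrative.

// jacobianToAffine maps (X, Y, Z) to (X/Z^2, Y/Z^3) mod p.
func jacobianToAffine(X, Y, Z, p *big.Int) (x, y *big.Int) {
	zInv := new(big.Int).ModInverse(Z, p) // Z must be non-zero mod p
	zInv2 := new(big.Int).Mod(new(big.Int).Mul(zInv, zInv), p)
	zInv3 := new(big.Int).Mod(new(big.Int).Mul(zInv2, zInv), p)
	x = new(big.Int).Mod(new(big.Int).Mul(X, zInv2), p)
	y = new(big.Int).Mod(new(big.Int).Mul(Y, zInv3), p)
	return x, y
}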
+// `G1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) +// `G1` is defined as a struct to be exportable through cgo to the Go layer. +typedef struct {Fp x,y,z} G1; + +// field elements F_p^2 +// F_p^2 elements are represented as a vector of two F_p elements. +// `Fp2` is equivalent to type `vec384x` (used internally by BLST for F_p^2 elements). +// `Fp2` does not need to be exported to cgo. +typedef vec384x Fp2; + +// Subroup G2 in E2 +// G2 points are represented in Jacobian coordinates (x,y,z), +// where x, y, x are elements of F_p (type `Fp`). +// `G2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E1 elements) +// `G2` is defined as a struct to be exportable through cgo to the Go layer. +typedef struct {Fp2 x,y,z} G2; #endif From e783a4289e4c02d5e63fc8be8014ce34f6850ff1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 5 Apr 2023 18:57:40 -0600 Subject: [PATCH 025/200] new Fp and Fp2 tools, but still the mess --- crypto/bls12381_utils.c | 714 +++++++++++++++++++++++---------------- crypto/bls12381_utils.go | 6 +- crypto/bls12381_utils.h | 8 +- crypto/dkg_core.c | 2 +- 4 files changed, 438 insertions(+), 292 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index f1a93173971..3d2a1b99f6a 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -23,7 +23,101 @@ int get_Fr_BYTES() { return Fr_BYTES; } -// Fr utilities + +// Initializes Relic context with BLS12-381 parameters +ctx_t* relic_init_BLS12_381() { + // check Relic was compiled with the right conf + assert(ALLOC == AUTO); + + // sanity check of Relic constants the package is relying on + assert(RLC_OK == RLC_EQ); + + // initialize relic core with a new context + ctx_t* bls_ctx = (ctx_t*) calloc(1, sizeof(ctx_t)); + if (!bls_ctx) return NULL; + core_set(bls_ctx); + if (core_init() != RLC_OK) return NULL; + + // init BLS curve + int ret = RLC_OK; + #if (FP_PRIME == 381) + ret = ep_param_set_any_pairf(); // sets B12_P381 if FP_PRIME = 381 in relic config + #else + ep_param_set(B12_P381); + ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist + #endif + + if (ret != RLC_OK) return NULL; + return core_get(); +} + +// seeds relic PRG +void seed_relic(byte* seed, int len) { + #if RAND == HASHD + // instantiate a new DRBG + ctx_t *ctx = core_get(); + ctx->seeded = 0; + #endif + rand_seed(seed, len); +} + +// global variable of the pre-computed data +prec_st bls_prec_st; +prec_st* bls_prec = NULL; + +// required constants for the optimized SWU hash to curve +#if (hashToPoint == LOCAL_SSWU) +extern const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS]; +extern const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS]; +#endif + +#if (MEMBERSHIP_CHECK_G1 == BOWE) +extern const uint64_t beta_data[Fp_LIMBS]; +extern const uint64_t z2_1_by3_data[2]; +#endif + +// sets the global variable to input +void precomputed_data_set(const prec_st* p) { + bls_prec = (prec_st*)p; +} + +// pre-compute some data required for curve BLS12-381 +prec_st* init_precomputed_data_BLS12_381() { + bls_prec = &bls_prec_st; + ctx_t* ctx = core_get(); + + // (p-1)/2 + bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); + #if (hashToPoint == LOCAL_SSWU) + // (p-3)/4 + bn_div_dig(&bls_prec->p_3div4, &bls_prec->p_1div2, 2); + // sqrt(-z) + fp_neg(bls_prec->sqrt_z, ctx->ep_map_u); + fp_srt(bls_prec->sqrt_z, bls_prec->sqrt_z); + // -a1 and a1*z + fp_neg(bls_prec->minus_a1, ctx->ep_iso.a); + fp_mul(bls_prec->a1z, ctx->ep_iso.a, ctx->ep_map_u); + + for (int i=0; iiso_Nx[i], 
iso_Nx_data[i]); + for (int i=0; iiso_Ny[i], iso_Ny_data[i]); + #endif + + #if (MEMBERSHIP_CHECK_G1 == BOWE) + bn_new(&bls_prec->beta); + bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); + bn_new(&bls_prec->z2_1_by3); + bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); + #endif + + // Montgomery constant R + fp_set_dig(bls_prec->r, 1); + return bls_prec; +} + +// ------------------- Fr utilities + // Montgomery constant R related to the curve order r const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), @@ -187,18 +281,19 @@ static void pow256_from_Fr(pow256 ret, const Fr* in) { } // reads a scalar in `a` and checks it is a valid Fr element (a < r). -// input bytes are big endian. +// input is bytes-big-endian. // returns: // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr -// - v if the scalar is valid +// - BLST_SUCCESS if the scalar is valid BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { if (len != Fr_BYTES) { return BLST_BAD_ENCODING; } pow256 tmp; + // compare to r using the provided tool from BLST pow256_from_be_bytes(tmp, bin); - if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! + if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! return BLST_BAD_SCALAR; } vec_zero(tmp, Fr_BYTES); @@ -230,9 +325,8 @@ void Fr_write_bytes(uint8_t *bin, const Fr* a) { } // maps big-endian bytes into an Fr element using modular reduction -// output is vec256 (also used as Fr) -static void -vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) +// Input is byte-big-endian, output is vec256 (also used as Fr) +static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); @@ -257,236 +351,93 @@ vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) Fr_set_zero(&digit); } -// Reads a scalar from an array and maps it to Fr. +// Reads a scalar from an array and maps it to Fr using modular reduction. +// Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { vec256_from_be_bytes(a, bin, len); - //Fr_set_limb(a, 1); TODO: delete return Fr_is_zero(a); } -// global variable of the pre-computed data -prec_st bls_prec_st; -prec_st* bls_prec = NULL; - -// required constants for the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) -extern const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS]; -extern const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS]; -#endif - -#if (MEMBERSHIP_CHECK_G1 == BOWE) -extern const uint64_t beta_data[Fp_LIMBS]; -extern const uint64_t z2_1_by3_data[2]; -#endif - -// sets the global variable to input -void precomputed_data_set(const prec_st* p) { - bls_prec = (prec_st*)p; -} - -// Reads a prime field element from a digit vector in big endian format. -// There is no conversion to Montgomery domain in this function. 
- #define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) - -// pre-compute some data required for curve BLS12-381 -prec_st* init_precomputed_data_BLS12_381() { - bls_prec = &bls_prec_st; - ctx_t* ctx = core_get(); - - // (p-1)/2 - bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - #if (hashToPoint == LOCAL_SSWU) - // (p-3)/4 - bn_div_dig(&bls_prec->p_3div4, &bls_prec->p_1div2, 2); - // sqrt(-z) - fp_neg(bls_prec->sqrt_z, ctx->ep_map_u); - fp_srt(bls_prec->sqrt_z, bls_prec->sqrt_z); - // -a1 and a1*z - fp_neg(bls_prec->minus_a1, ctx->ep_iso.a); - fp_mul(bls_prec->a1z, ctx->ep_iso.a, ctx->ep_map_u); - - for (int i=0; iiso_Nx[i], iso_Nx_data[i]); - for (int i=0; iiso_Ny[i], iso_Ny_data[i]); - #endif - - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_new(&bls_prec->beta); - bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); - bn_new(&bls_prec->z2_1_by3); - bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); - #endif - - // Montgomery constant R - fp_set_dig(bls_prec->r, 1); - return bls_prec; -} - -// Initializes Relic context with BLS12-381 parameters -ctx_t* relic_init_BLS12_381() { - // check Relic was compiled with the right conf - assert(ALLOC == AUTO); - - // sanity check of Relic constants the package is relying on - assert(RLC_OK == RLC_EQ); - - // initialize relic core with a new context - ctx_t* bls_ctx = (ctx_t*) calloc(1, sizeof(ctx_t)); - if (!bls_ctx) return NULL; - core_set(bls_ctx); - if (core_init() != RLC_OK) return NULL; - - // init BLS curve - int ret = RLC_OK; - #if (FP_PRIME == 381) - ret = ep_param_set_any_pairf(); // sets B12_P381 if FP_PRIME = 381 in relic config - #else - ep_param_set(B12_P381); - ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist - #endif - - if (ret != RLC_OK) return NULL; - return core_get(); -} - -// seeds relic PRG -void seed_relic(byte* seed, int len) { - #if RAND == HASHD - // instantiate a new DRBG - ctx_t *ctx = core_get(); - ctx->seeded = 0; - #endif - rand_seed(seed, len); -} - -// Exponentiation of a generic point p in G1 -void ep_mult(ep_t res, const ep_t p, const Fr *expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); -} - -// Exponentiation of generator g1 in G1 -// These two function are here for bench purposes only -void ep_mult_gen_bench(ep_t res, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using precomputed table of size 4 - ep_mul_gen(res, tmp_expo); - free(tmp_expo); -} +// ------------------- Fp utilities -void ep_mult_generic_bench(ep_t res, const Fr* expo) { - // generic point multiplication - ep_mult(res, &core_get()->ep_g, expo); -} +// Montgomery constant R related to the prime p +const limb_t BLS12_381_pR[Fp_LIMBS] = { ONE_MONT_P }; /* (1<<384)%p */ -// Exponentiation of a generic point p in G2 -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep2_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); +// sets `a` to 0 +void Fp_set_zero(Fp* a){ + vec_zero((byte*)a, Fp_BYTES); } -// Exponentiation of generator g2 in G2 -void G2_mult_gen(G2* res, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_sign(res, &BLS12_381_G2, tmp); +// sets `a` to limb `l` +void Fp_set_limb(Fp* a, const limb_t l){ + vec_zero((byte*)a + sizeof(limb_t), Fp_BYTES - sizeof(limb_t)); + *((limb_t*)a) = l; } -// DEBUG printing functions -void bytes_print_(char* s, byte* data, int len) { - printf("[%s]:\n", s); - for (int i=0; i (p - 
1)/2 and 0 otherwise. -static int fp_get_sign(const fp_t y) { - bn_t bn_y; - bn_new(bn_y); - fp_prime_back(bn_y, y); - return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; +// res = a^2 * R^(-1) +void Fp_squ_montg(Fp *res, const Fp *a) { + sqr_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); } -// ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_write_bin -void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - - if (len!=G1_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep_is_infty(a)) { - // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G1_size-1); - return; - } - - RLC_TRY { - ep_t t; - ep_null(t); - ep_new(t); - ep_norm(t, a); - fp_write_bin(bin, Fp_BYTES, t->x); +// res = a*R +void Fp_to_montg(Fp *res, const Fp *a) { + mul_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_RR, BLS12_381_P, p0); +} - if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp_get_sign(t->y) << 5); - } else { - fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); - } - ep_free(t); - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } +// res = a*R^(-1) +void Fp_from_montg(Fp *res, const Fp *a) { + from_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); +} - bin[0] |= (G1_SERIALIZATION << 7); - } +// reads a scalar in `a` and checks it is a valid Fp element (a < p). +// input is bytes-big-endian. +// returns: +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fp +// - BLST_SUCCESS if the scalar is valid +BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { + if (len != Fp_BYTES) { + return BLST_BAD_ENCODING; + } + limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); + // compare read scalar to p + if (!check_Fp(a)) { + return BLST_BAD_ENCODING; + } + return BLST_SUCCESS; +} // fp_read_bin_safe is a modified version of Relic's (void fp_read_bin). // It reads a field element from a buffer and makes sure the big number read can be @@ -526,6 +477,79 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { return ret; } +// Reads a prime field element from a digit vector in big endian format. +// There is no conversion to Montgomery domain in this function. + #define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) + +// returns the sign of y. +// 1 if y > (p - 1)/2 and 0 otherwise. +// y is in montgomery form +static int Fp_get_sign(const fp_t y) { + sgn0_pty_mont_384(y, BLS12_381_P, p0); +} + +// ------------------- Fp^2 utilities + +// sets `a` to limb `l` +void Fp2_set_limb(Fp2* a, const limb_t l){ + Fp_set_limb(a[0], l); // TODO: check!! 
+ Fp_set_zero(a[1]); +} + +void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { + add_mod_384x(res, a, b, BLS12_381_P); +} + +void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { + sub_mod_384x(res, a, b, BLS12_381_P); +} + +void Fp2_neg(Fp2 *res, const Fp2 *a) { + cneg_mod_384(res[0], a[0], 1, BLS12_381_P); + cneg_mod_384(res[1], a[1], 1, BLS12_381_P); +} + +// res = a*b in montgomery form +void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { + mul_mont_384x(res, a, b, BLS12_381_P, p0); +} + +// res = a^2 in montgomery form +void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { + sqr_mont_384x(res, a, BLS12_381_P, p0); +} + +// returns the sign of y. +// sign(y_0) if y_1 = 0, else sign(y_1) +// y coordinates are in montgommery form +static int Fp2_get_sign(fp2_t y) { + sgn0_pty_mont_384x(y, BLS12_381_P, p0); +} + +// reads an Fp^2 element in `a`. +// input is a serialization of a[1] concatenated to serializetion of a[0]. +// a[i] are both Fp elements. +// returns: +// - BLST_BAD_ENCODING if the length is invalid +// - BLST_BAD_SCALAR if the scalar isn't in Fp +// - BLST_SUCCESS if the scalar is valid +static BLST_ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { + if (len != Fp2_BYTES) { + return BLST_BAD_ENCODING; + } + BLST_ERROR ret = Fp_read_bytes(a[0], bin, Fp_BYTES); + if (ret != BLST_SUCCESS) { + return ret; + } + ret = Fp_read_bytes(a[1], bin + Fp_BYTES, Fp_BYTES); + if ( ret != BLST_SUCCESS) { + return ret; + } + return BLST_SUCCESS; +} + +// ------------------- G1 utilities + // ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. // len is the size of the input buffer. // @@ -600,92 +624,92 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { return RLC_ERR; } - -// returns the sign of y. -// sign(y_0) if y_1 = 0, else sign(y_1) -static int fp2_get_sign(fp2_t y) { - if (fp_is_zero(y[1])) { // no need to convert back as the montgomery form of 0 is 0 - return fp_get_sign(y[0]); - } - return fp_get_sign(y[1]); -} - -// ep2_write_bin_compact exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. +// ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. // len is the allocated size of the buffer bin. 
// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep2_write_bin -void ep2_write_bin_compact(byte *bin, const ep2_t a, const int len) { - ep2_t t; - ep2_null(t); - const int G2_size = (G2_BYTES/(G2_SERIALIZATION+1)); +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +// The code is a modified version of Relic ep_write_bin +void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { + const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - if (len!=G2_size) { + if (len!=G1_size) { RLC_THROW(ERR_NO_BUFFER); return; } - if (ep2_is_infty((ep2_st *)a)) { + if (ep_is_infty(a)) { // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G2_size-1); + bin[0] = (G1_SERIALIZATION << 7) | 0x40; + memset(bin+1, 0, G1_size-1); return; } RLC_TRY { - ep2_new(t); - ep2_norm(t, (ep2_st *)a); - fp2_write_bin(bin, Fp2_BYTES, t->x, 0); + ep_t t; + ep_null(t); + ep_new(t); + ep_norm(t, a); + fp_write_bin(bin, Fp_BYTES, t->x); - if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp2_get_sign(t->y) << 5); + if (G1_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp_get_sign(t->y) << 5); } else { - fp2_write_bin(bin + Fp2_BYTES, Fp2_BYTES, t->y, 0); + fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); } + ep_free(t); } RLC_CATCH_ANY { RLC_THROW(ERR_CAUGHT); } - bin[0] |= (G2_SERIALIZATION << 7); - ep_free(t); + bin[0] |= (G1_SERIALIZATION << 7); + } + +// Exponentiation of a generic point p in G1 +void ep_mult(ep_t res, const ep_t p, const Fr *expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); + // Using window NAF of size 2 + ep_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } -// fp2_read_bin_safe is a modified version of Relic's (void fp2_read_bin). -// It reads an Fp^2 element from a buffer and makes sure the big numbers read can be -// written as field elements (are reduced modulo p). -// Unlike Relic's versions, the function does not reduce the read integers modulo p and does -// not throw an exception for integers larger than p. The function returns RLC_OK if the input -// corresponds to a field element in Fp^2, and returns RLC_ERR otherwise. -static int fp2_read_bin_safe(fp2_t a, const uint8_t *bin, int len) { - if (len != Fp2_BYTES) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[0], bin, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[1], bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - return RLC_OK; +// Exponentiation of generator g1 in G1 +// These two function are here for bench purposes only +void ep_mult_gen_bench(ep_t res, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); + // Using precomputed table of size 4 + ep_mul_gen(res, tmp_expo); + free(tmp_expo); +} + +void ep_mult_generic_bench(ep_t res, const Fr* expo) { + // generic point multiplication + ep_mult(res, &core_get()->ep_g, expo); } -// ep2_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. +// ------------------- G2 utilities + +// G2_read_bytes imports a point from a buffer in a compressed or uncompressed form. // The resulting point is guaranteed to be on curve E2. // -// It returns RLC_OK if the inputs are valid (input buffer lengths are valid and read coordinates -// correspond to a point on curve) and the execution completes and RLC_ERR otherwise. 
-// The code is a modified version of Relic ep2_read_bin -int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { +// reads a scalar in `a` and checks it is a valid Fp element (a < p). +// input is bytes-big-endian. +// returns: +// - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BLST_BAD_SCALAR if Fp^2 coordinates couldn't deserialize +// - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E2 +// - BLST_SUCCESS if deserialization is valid + +// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z ? +BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { // check the length - const int G2size = (G2_BYTES/(G2_SERIALIZATION+1)); - if (len!=G2size) { - return RLC_ERR; + if (len != G2_SER_BYTES) { + return BLST_BAD_ENCODING; } // check the compression bit int compressed = bin[0] >> 7; if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; + return BLST_BAD_ENCODING; } // check if the point in infinity @@ -693,54 +717,129 @@ int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { if (is_infinity) { // the remaining bits need to be cleared if (bin[0] & 0x3F) { - return RLC_ERR; + return BLST_BAD_ENCODING; } - for (int i=1; i> 5) & 1; if (y_sign && (!compressed)) { - return RLC_ERR; + return BLST_BAD_ENCODING; } - a->coord = BASIC; - fp2_set_dig(a->z, 1); // a.z // use a temporary buffer to mask the header bits and read a.x byte temp[Fp2_BYTES]; memcpy(temp, bin, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits - if (fp2_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; + BLST_ERROR ret = fp2_read_bytes(a->x, temp, sizeof(temp)); + if (ret != BLST_SUCCESS) { + return ret; } + // set a.z to 1 + Fp_copy(a->z[0], BLS12_381_pR); + Fp_set_zero(a->z[1]); + if (G2_SERIALIZATION == UNCOMPRESSED) { - if (fp2_read_bin_safe(a->y, bin + Fp2_BYTES, Fp2_BYTES) != RLC_OK){ - return RLC_ERR; + ret = fp2_read_bytes(a->y, bin + Fp2_BYTES, sizeof(a->y)); + if (ret != BLST_SUCCESS){ + return ret; } // check read point is on curve - if (!ep2_on_curve(a)) { - return RLC_ERR; + if (!G2_on_curve(a)) { + return BLST_POINT_NOT_ON_CURVE; } - return RLC_OK; + return BLST_SUCCESS; } - fp2_zero(a->y); - fp_set_bit(a->y[0], 0, y_sign); - fp_zero(a->y[1]); - if (ep2_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; + // compute the possible square root + Fp_to_montg((a->x)[0], a->x[0]); + Fp_to_montg(a->x[1], a->x[1]); + + Fp2_squ_montg(a->y, a->x); + Fp2_mul_montg(a->y, a->y, a->x); + Fp2_add(a->y, a->y, B_E2); + if (!sqrt_fp2(a->y, a->y)) // (y^2 = x^3+b) has no solution in y + return BLST_POINT_NOT_ON_CURVE; + + // resulting (x,y) is guaranteed to be on curve + if (Fp2_get_sign(a->y) != y_sign) { + Fp2_neg(a->y, a->y); // flip y sign if needed } - return RLC_ERR; + return BLST_SUCCESS; +} + +// ep2_write_bin_compact exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. +// len is the allocated size of the buffer bin. 
+// The serialization is following: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +// The code is a modified version of Relic ep2_write_bin +void ep2_write_bin_compact(byte *bin, const ep2_t a, const int len) { + ep2_t t; + ep2_null(t); + const int G2_size = (G2_BYTES/(G2_SERIALIZATION+1)); + + if (len!=G2_size) { + RLC_THROW(ERR_NO_BUFFER); + return; + } + + if (ep2_is_infty((ep2_st *)a)) { + // set the infinity bit + bin[0] = (G2_SERIALIZATION << 7) | 0x40; + memset(bin+1, 0, G2_size-1); + return; + } + + RLC_TRY { + ep2_new(t); + ep2_norm(t, (ep2_st *)a); + fp2_write_bin(bin, Fp2_BYTES, t->x, 0); + + if (G2_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp2_get_sign(t->y) << 5); + } else { + fp2_write_bin(bin + Fp2_BYTES, Fp2_BYTES, t->y, 0); + } + } RLC_CATCH_ANY { + RLC_THROW(ERR_CAUGHT); + } + + bin[0] |= (G2_SERIALIZATION << 7); + ep_free(t); +} + +// set p to infinity +static void G2_set_infty(G2* p) { + vec_zero(p, G2_BYTES); +} + +// checks p is on G2 +static bool G2_on_curve(G2* p) { + return POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p)); +} + +// Exponentiation of a generic point p in G2 +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { + bn_st* tmp_expo = Fr_blst_to_relic(expo); + // Using window NAF of size 2 + ep2_mul_lwnaf(res, p, tmp_expo); + free(tmp_expo); } +// Exponentiation of generator g2 in G2 +void G2_mult_gen(G2* res, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_sign(res, &BLS12_381_G2, tmp); +} // computes the sum of the G2 array elements y and writes the sum in jointy void ep2_sum_vector(ep2_t jointy, ep2_st* y, const int len){ @@ -753,6 +852,9 @@ void ep2_sum_vector(ep2_t jointy, ep2_st* y, const int len){ // public key } +// ------------------- other + + // Verifies the validity of 2 SPoCK proofs and 2 public keys. // Membership check in G1 of both proofs is verified in this function. // Membership check in G2 of both keys is not verified in this function. 
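/*
 * A toy, self-contained sketch of the decompression step performed by the
 * G2 deserialization above: recover y from x by solving y^2 = x^3 + b, then
 * flip the sign of the root so it matches the serialized sign bit.
 * This sketch uses a tiny prime field (p = 103, curve y^2 = x^3 + 4) instead
 * of BLS12-381's F_p^2; every name and constant below is illustrative only
 * and not part of the library. Since p % 4 == 3, a square root (when it
 * exists) is a single exponentiation by (p+1)/4.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_P 103u  /* small prime, TOY_P % 4 == 3 */
#define TOY_B 4u    /* toy curve: y^2 = x^3 + 4 */

static uint32_t toy_pow_mod(uint32_t a, uint32_t e) {
    uint64_t r = 1, base = a % TOY_P;
    while (e) {
        if (e & 1) r = (r * base) % TOY_P;
        base = (base * base) % TOY_P;
        e >>= 1;
    }
    return (uint32_t)r;
}

/* returns 1 and sets *y if x is the x-coordinate of a curve point, 0 otherwise */
static int toy_decompress(uint32_t x, int y_sign, uint32_t* y) {
    uint32_t rhs  = (toy_pow_mod(x, 3) + TOY_B) % TOY_P;    /* x^3 + b */
    uint32_t root = toy_pow_mod(rhs, (TOY_P + 1) / 4);      /* candidate square root */
    if ((uint64_t)root * root % TOY_P != rhs) return 0;     /* x^3 + b is not a square */
    /* sign convention as in fp_get_sign above: 1 iff y > (p-1)/2 */
    int sign = root > (TOY_P - 1) / 2;
    *y = (sign == y_sign) ? root : (TOY_P - root) % TOY_P;  /* flip sign if needed */
    return 1;
}

int main(void) {
    uint32_t y;
    for (uint32_t x = 0; x < TOY_P; x++) {
        if (toy_decompress(x, 1, &y)) printf("x=%2u -> y=%2u\n", x, y);
    }
    return 0;
}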
@@ -1022,3 +1124,45 @@ void ep2_rand_G2complement(ep2_t p) { void xmd_sha256(uint8_t *hash, int len_hash, uint8_t *msg, int len_msg, uint8_t *dst, int len_dst){ md_xmd_sh256(hash, len_hash, msg, len_msg, dst, len_dst); } + + +// DEBUG printing functions +void bytes_print_(char* s, byte* data, int len) { + printf("[%s]:\n", s); + for (int i=0; i Date: Tue, 11 Apr 2023 17:39:33 -0600 Subject: [PATCH 026/200] first changes to use new type G2 --- crypto/bls.go | 12 +- crypto/bls12381_utils.c | 298 ++++++++++++++++++++++--------- crypto/bls12381_utils.go | 38 ++-- crypto/bls12381_utils.h | 55 +++--- crypto/bls12381_utils_test.go | 42 ++--- crypto/bls_core.c | 27 +-- crypto/bls_include.h | 10 +- crypto/bls_multisig.go | 23 ++- crypto/bls_test.go | 26 +-- crypto/bls_thresholdsign.go | 4 +- crypto/bls_thresholdsign_test.go | 3 +- crypto/blst_include.h | 28 +-- crypto/{ => blst_src}/blst_src.c | 1 + crypto/blst_src/client_min_pk.c | 17 -- crypto/blst_src/client_min_sig.c | 17 -- crypto/dkg.go | 3 + crypto/dkg_core.c | 14 +- crypto/dkg_feldmanvss.go | 17 +- crypto/dkg_feldmanvssq.go | 7 +- crypto/dkg_include.h | 2 +- crypto/dkg_jointfeldman.go | 11 +- crypto/dkg_test.go | 3 +- crypto/spock.go | 6 +- crypto/spock_test.go | 2 + crypto/thresholdsign.go | 2 + 25 files changed, 409 insertions(+), 259 deletions(-) rename crypto/{ => blst_src}/blst_src.c (99%) delete mode 100644 crypto/blst_src/client_min_pk.c delete mode 100644 crypto/blst_src/client_min_sig.c diff --git a/crypto/bls.go b/crypto/bls.go index 447ba6f532e..66f4c809e85 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -211,7 +211,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, nil } - verif := C.bls_verify((*C.ep2_st)(&pk.point), + verif := C.bls_verify((*C.G2)(&pk.point), (*C.uchar)(&s[0]), (*C.uchar)(&h[0]), (C.int)(len(h))) @@ -352,7 +352,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.check_membership_G2((*C.ep2_st)(&pk.point)) != valid { + if C.G2_check_membership((*C.G2)(&pk.point)) != valid { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } @@ -498,15 +498,15 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { if serializationG2 != compressed { panic("library is not configured to use compressed public key serialization") } - return a.Encode() + dest := make([]byte, pubKeyLengthBLSBLS12381) + writePointG2(dest, &a.point) + return dest } // Encode returns a byte encoding of the public key. // Since we use a compressed encoding by default, this delegates to EncodeCompressed func (a *pubKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, pubKeyLengthBLSBLS12381) - writePointG2(dest, &a.point) - return dest + return a.EncodeCompressed() } // Equals checks is two public keys are equal diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 3d2a1b99f6a..b66be0932a2 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,8 @@ #include "bls_include.h" #include "assert.h" +#include "blst_src.c" + // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) // return macro values to the upper Go Layer @@ -81,8 +83,13 @@ void precomputed_data_set(const prec_st* p) { bls_prec = (prec_st*)p; } +// Reads a prime field element from a digit vector in big endian format. +// There is no conversion to Montgomery domain in this function. 
+#define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) + // pre-compute some data required for curve BLS12-381 prec_st* init_precomputed_data_BLS12_381() { + bls_prec = &bls_prec_st; ctx_t* ctx = core_get(); @@ -375,21 +382,11 @@ void Fp_set_limb(Fp* a, const limb_t l){ *((limb_t*)a) = l; } -static bool check_Fp(byte *in) { - // use same method as in BLST internal function - // which seems the most efficient. The method uses the assembly-based - // modular addition instead of limbs comparison - vec384 temp; - add_fp(temp, in, ZERO_384); - return vec_is_equal(temp, in, Fp_BYTES); - // no need to clear `tmp` as no use-case involves sensitive data being passed as `in` -} - void Fp_copy(Fp* res, const Fp* a) { vec_copy((byte*)res, (byte*)a, Fp_BYTES); } -void Fp_add(Fp *res, const Fp *a, const Fp *b) { +static void Fp_add(Fp *res, const Fp *a, const Fp *b) { add_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); } @@ -401,6 +398,16 @@ void Fp_neg(Fp *res, const Fp *a) { cneg_mod_384((limb_t*)res, (limb_t*)a, 1, BLS12_381_P); } +static bool check_Fp(const Fp* in) { + // use same method as in BLST internal function + // which seems the most efficient. The method uses the assembly-based + // modular addition instead of limbs comparison + Fp temp; + Fp_add(&temp, in, &ZERO_384); + return vec_is_equal(&temp, in, Fp_BYTES); + // no need to clear `tmp` as no use-case involves sensitive data being passed as `in` +} + // res = a*b*R^(-1) void Fp_mul_montg(Fp *res, const Fp *a, const Fp *b) { mul_mont_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P, p0); @@ -439,6 +446,12 @@ BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { return BLST_SUCCESS; } + +// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. +void Fp_write_bytes(byte *bin, const Fp* a) { + be_bytes_from_limbs(bin, (limb_t*)a, Fp_BYTES); +} + // fp_read_bin_safe is a modified version of Relic's (void fp_read_bin). // It reads a field element from a buffer and makes sure the big number read can be // written as a field element (is reduced modulo p). @@ -477,57 +490,59 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { return ret; } -// Reads a prime field element from a digit vector in big endian format. -// There is no conversion to Montgomery domain in this function. - #define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) - // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static int Fp_get_sign(const fp_t y) { - sgn0_pty_mont_384(y, BLS12_381_P, p0); +static limb_t Fp_get_sign(const fp_t y) { + return sgn0_pty_mont_384(y, BLS12_381_P, p0); } // ------------------- Fp^2 utilities // sets `a` to limb `l` void Fp2_set_limb(Fp2* a, const limb_t l){ - Fp_set_limb(a[0], l); // TODO: check!! 
- Fp_set_zero(a[1]); + Fp_set_limb(&real(a), l); + Fp_set_zero(&imag(a)); } void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { - add_mod_384x(res, a, b, BLS12_381_P); + add_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { - sub_mod_384x(res, a, b, BLS12_381_P); + sub_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } void Fp2_neg(Fp2 *res, const Fp2 *a) { - cneg_mod_384(res[0], a[0], 1, BLS12_381_P); - cneg_mod_384(res[1], a[1], 1, BLS12_381_P); + cneg_mod_384(real(res), real(a), 1, BLS12_381_P); + cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); } // res = a*b in montgomery form void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { - mul_mont_384x(res, a, b, BLS12_381_P, p0); + mul_mont_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P, p0); } // res = a^2 in montgomery form void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { - sqr_mont_384x(res, a, BLS12_381_P, p0); + sqr_mont_384x((vec384*)res, (vec384*)a, BLS12_381_P, p0); +} + +// checks if `a` is a quadratic residue in Fp^2. If yes, it computes +// the square root in `res`. +static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { + return sqrt_fp2((vec384*)res, (vec384*)a); } // returns the sign of y. // sign(y_0) if y_1 = 0, else sign(y_1) // y coordinates are in montgommery form -static int Fp2_get_sign(fp2_t y) { - sgn0_pty_mont_384x(y, BLS12_381_P, p0); +static limb_t Fp2_get_sign(Fp2* y) { + return sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0); } // reads an Fp^2 element in `a`. -// input is a serialization of a[1] concatenated to serializetion of a[0]. +// input is a serialization of real(a) concatenated to serializetion of imag(a). // a[i] are both Fp elements. // returns: // - BLST_BAD_ENCODING if the length is invalid @@ -537,17 +552,23 @@ static BLST_ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { if (len != Fp2_BYTES) { return BLST_BAD_ENCODING; } - BLST_ERROR ret = Fp_read_bytes(a[0], bin, Fp_BYTES); + BLST_ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); if (ret != BLST_SUCCESS) { return ret; } - ret = Fp_read_bytes(a[1], bin + Fp_BYTES, Fp_BYTES); + ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); if ( ret != BLST_SUCCESS) { return ret; } return BLST_SUCCESS; } +// write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. +void Fp2_write_bytes(byte *bin, const Fp2* a) { + Fp_write_bytes(bin, &real(a)); + Fp_write_bytes(bin + Fp_BYTES, &imag(a)); +} + // ------------------- G1 utilities // ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. 
@@ -686,10 +707,101 @@ void ep_mult_generic_bench(ep_t res, const Fr* expo) { ep_mult(res, &core_get()->ep_g, expo); } -// ------------------- G2 utilities +// ------------------- E2 utilities + +// TODO: to delete +static int fp2_read_bin_safe(fp2_t a, const uint8_t *bin, int len) { + if (len != Fp2_BYTES) { + return RLC_ERR; + } + if (fp_read_bin_safe(a[0], bin, Fp_BYTES) != RLC_OK) { + return RLC_ERR; + } + if (fp_read_bin_safe(a[1], bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { + return RLC_ERR; + } + return RLC_OK; +} + +// TODO: to delete, only used by temporary E2_blst_to_relic +static int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { + // check the length + const int G2size = (G2_BYTES/(G2_SERIALIZATION+1)); + if (len!=G2size) { + return RLC_ERR; + } + + // check the compression bit + int compressed = bin[0] >> 7; + if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { + return RLC_ERR; + } + + // check if the point in infinity + int is_infinity = bin[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (bin[0] & 0x3F) { + return RLC_ERR; + } + for (int i=1; i> 5) & 1; + if (y_sign && (!compressed)) { + return RLC_ERR; + } + + a->coord = BASIC; + fp2_set_dig(a->z, 1); // a.z + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp2_BYTES]; + memcpy(temp, bin, Fp2_BYTES); + temp[0] &= 0x1F; // clear the header bits + if (fp2_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { + return RLC_ERR; + } + + if (G2_SERIALIZATION == UNCOMPRESSED) { + if (fp2_read_bin_safe(a->y, bin + Fp2_BYTES, Fp2_BYTES) != RLC_OK){ + return RLC_ERR; + } + // check read point is on curve + if (!ep2_on_curve(a)) { + return RLC_ERR; + } + return RLC_OK; + } + + fp2_zero(a->y); + fp_set_bit(a->y[0], 0, y_sign); + fp_zero(a->y[1]); + if (ep2_upk(a, a) == 1) { + // resulting point is guaranteed to be on curve + return RLC_OK; + } + return RLC_ERR; +} + +// TODO: temp utility function to delete +ep2_st* E2_blst_to_relic(const G2* x) { + ep2_st* out = (ep2_st*)malloc(sizeof(ep2_st)); + byte* data = (byte*)malloc(G2_SER_BYTES); + E2_write_bytes(data, x); + ep2_read_bin_compact(out, data, G2_SER_BYTES); + free(data); + return out; +} + +// E2_read_bytes imports a point from a buffer in a compressed or uncompressed form. +// The resulting point is guaranteed to be on curve E2 (no G2 check is included) // // reads a scalar in `a` and checks it is a valid Fp element (a < p). // input is bytes-big-endian. @@ -699,8 +811,9 @@ void ep_mult_generic_bench(ep_t res, const Fr* expo) { // - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E2 // - BLST_SUCCESS if deserialization is valid -// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z ? -BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { +// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, +// and update logic with G2 subgroup check? 
+BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { return BLST_BAD_ENCODING; @@ -724,7 +837,7 @@ BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { return BLST_BAD_ENCODING; } } - G2_set_infty(a); + E2_set_infty(a); return RLC_OK; } @@ -738,92 +851,113 @@ BLST_ERROR G2_read_bytes(G2* a, const byte *bin, const int len) { byte temp[Fp2_BYTES]; memcpy(temp, bin, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits - BLST_ERROR ret = fp2_read_bytes(a->x, temp, sizeof(temp)); + BLST_ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); if (ret != BLST_SUCCESS) { return ret; } // set a.z to 1 - Fp_copy(a->z[0], BLS12_381_pR); - Fp_set_zero(a->z[1]); + Fp2* a_z = &(a->z); + Fp_copy(&real(a_z), &BLS12_381_pR); + Fp_set_zero(&imag(a_z)); if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = fp2_read_bytes(a->y, bin + Fp2_BYTES, sizeof(a->y)); + ret = Fp2_read_bytes(&(a->y), bin + Fp2_BYTES, sizeof(a->y)); if (ret != BLST_SUCCESS){ return ret; } // check read point is on curve - if (!G2_on_curve(a)) { + if (!E2_affine_on_curve(a)) { return BLST_POINT_NOT_ON_CURVE; } return BLST_SUCCESS; } // compute the possible square root - Fp_to_montg((a->x)[0], a->x[0]); - Fp_to_montg(a->x[1], a->x[1]); - - Fp2_squ_montg(a->y, a->x); - Fp2_mul_montg(a->y, a->y, a->x); - Fp2_add(a->y, a->y, B_E2); - if (!sqrt_fp2(a->y, a->y)) // (y^2 = x^3+b) has no solution in y + Fp2* a_x = &(a->x); + Fp_to_montg(&real(a_x), &real(a_x)); + Fp_to_montg(&imag(a_x), &imag(a_x)); + + Fp2* a_y = &(a->y); + Fp2_squ_montg(a_y, a_x); + Fp2_mul_montg(a_y, a_y, a_x); + Fp2_add(a_y, a_y, &B_E2); + if (!Fp2_sqrt(a_y, a_y)) // if (y^2 = x^3+b) has no solution in y return BLST_POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve - if (Fp2_get_sign(a->y) != y_sign) { - Fp2_neg(a->y, a->y); // flip y sign if needed + if (Fp2_get_sign(a_y) != y_sign) { + Fp2_neg(a_y, a_y); // flip y sign if needed } return BLST_SUCCESS; } -// ep2_write_bin_compact exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: +// E2_write_bytes exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. 
+// It assumes buffer is of length G2_SER_BYTES +// The serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // The code is a modified version of Relic ep2_write_bin -void ep2_write_bin_compact(byte *bin, const ep2_t a, const int len) { - ep2_t t; - ep2_null(t); - const int G2_size = (G2_BYTES/(G2_SERIALIZATION+1)); - - if (len!=G2_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep2_is_infty((ep2_st *)a)) { +void E2_write_bytes(byte *bin, const G2* a) { + if (E2_is_infty(a)) { // set the infinity bit bin[0] = (G2_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G2_size-1); + memset(bin+1, 0, G2_SER_BYTES-1); return; } - RLC_TRY { - ep2_new(t); - ep2_norm(t, (ep2_st *)a); - fp2_write_bin(bin, Fp2_BYTES, t->x, 0); + G2 tmp; + E2_to_affine(&tmp, a); + Fp2* t_x = &(tmp.x); + Fp_from_montg(&real(t_x), &real(t_x)); + Fp_from_montg(&imag(t_x), &imag(t_x)); + Fp2_write_bytes(bin, t_x); - if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp2_get_sign(t->y) << 5); - } else { - fp2_write_bin(bin + Fp2_BYTES, Fp2_BYTES, t->y, 0); - } - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); + Fp2* t_y = &(tmp.y); + if (G2_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp2_get_sign(t_y) << 5); + } else { + Fp2_write_bytes(bin + Fp2_BYTES, t_y); } bin[0] |= (G2_SERIALIZATION << 7); - ep_free(t); } // set p to infinity -static void G2_set_infty(G2* p) { +void E2_set_infty(G2* p) { vec_zero(p, G2_BYTES); } -// checks p is on G2 -static bool G2_on_curve(G2* p) { - return POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p)); +// check if `p` is infinity +bool_t E2_is_infty(const G2* p) { + return vec_is_zero(p, sizeof(*p)); +} + +// checks affine point `p` is in E2 +bool_t E2_affine_on_curve(const G2* p) { + // BLST's `POINTonE2_affine_on_curve` does not include the inifity case, + // unlike what the function name means. 
+ return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); +} + +// checks p1 == p2 +bool_t E2_is_equal(const G2* p1, const G2* p2) { + // `POINTonE2_is_equal` includes the infinity case + return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); +} + +// converts an E2 point from Jacobian into affine coordinates (z=1) +void E2_to_affine(G2* res, const G2* p) { + // minor optimization in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { + vec_copy(res, p, G2_BYTES); + return; + } + // convert from Jacobian + POINTonE2_from_Jacobian((POINTonE2*)res, (const POINTonE2*)p); +} + +void E2_add(G2* res, const G2* a, const G2* b) { + POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } // Exponentiation of a generic point p in G2 @@ -838,7 +972,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { void G2_mult_gen(G2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign(res, &BLS12_381_G2, tmp); + POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); } // computes the sum of the G2 array elements y and writes the sum in jointy diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 82d13798455..9d59eb8d7d4 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -45,9 +45,10 @@ var invalid = C.get_invalid() // get some constants from the C layer // var blst_errors = C.blst_get_errors() -var blst_valid = (int)(C.BLST_SUCCESS) //int(blst_errors[0]) -var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) // int(blst_errors[0]) -var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) // int(blst_errors[0]) +var blst_valid = (int)(C.BLST_SUCCESS) +var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) +var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) +var blst_point_not_on_curve = (int)(C.BLST_POINT_NOT_ON_CURVE) // initContext sets relic B12_381 parameters and precomputes some data in the C layer func (ct *ctx) initContext() error { @@ -107,7 +108,7 @@ func (x *scalar) equals(other *scalar) bool { // comparison in G2 func (p *pointG2) equals(other *pointG2) bool { - return C.ep2_cmp((*C.ep2_st)(p), (*C.ep2_st)(other)) == valid + return C.E2_is_equal((*C.G2)(p), (*C.G2)(other)) != 0 } // Comparison to zero in Fr. @@ -118,7 +119,7 @@ func (x *scalar) isZero() bool { // Comparison to point at infinity in G2. 
func (p *pointG2) isInfinity() bool { - return C.ep2_is_infty((*C.ep2_st)(p)) == 1 + return C.E2_is_infty((*C.G2)(p)) != 10 } // returns a random element of Fr in input pointer @@ -165,9 +166,8 @@ func writeScalar(dest []byte, x *scalar) { // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointG2(dest []byte, a *pointG2) { - C.ep2_write_bin_compact((*C.uchar)(&dest[0]), - (*C.ep2_st)(a), - (C.int)(pubKeyLengthBLSBLS12381), + C.E2_write_bytes((*C.uchar)(&dest[0]), + (*C.G2)(a), ) } @@ -207,13 +207,17 @@ func readScalarFrStar(a *scalar, src []byte) error { // The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func readPointG2(a *pointG2, src []byte) error { - switch C.G2_read_bytes((*C.ep2_st)(a), + read := C.E2_read_bytes((*C.G2)(a), (*C.uchar)(&src[0]), - (C.int)(len(src))) { - case valid: + (C.int)(len(src))) + + switch int(read) { + case blst_valid: return nil - case invalid: - return invalidInputsErrorf("input is not a G2 point") + case blst_bad_encoding, blst_bad_scalar: + return invalidInputsErrorf("input could not deserialize to a G2 point") + case blst_point_not_on_curve: + return invalidInputsErrorf("input is not a point on curve E2") default: return errors.New("reading a G2 point failed") } @@ -244,7 +248,7 @@ func checkMembershipG1(pt *pointG1) int { // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. func checkMembershipG2(pt *pointG2) int { - return int(C.check_membership_G2((*C.ep2_st)(pt))) + return int(C.G2_check_membership((*C.G2)(pt))) } // randPointG1 wraps a call to C since cgo can't be used in go test files. @@ -259,17 +263,19 @@ func randPointG1Complement(pt *pointG1) { C.ep_rand_G1complement((*C.ep_st)(pt)) } +/* // randPointG2 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G2 and stores it in input point. func randPointG2(pt *pointG2) { - C.ep2_rand_G2((*C.ep2_st)(pt)) + C.ep2_rand_G2((*C.G2)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. func randPointG2Complement(pt *pointG2) { - C.ep2_rand_G2complement((*C.ep2_st)(pt)) + C.ep2_rand_G2complement((*C.G2)(pt)) } +*/ // This is only a TEST function. // It hashes `data` to a G1 point using the tag `dst` and returns the G1 point serialization. 
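/*
 * A minimal standalone sketch of the Zcash-style header byte that
 * writePointG2/readPointG2 (and the C serialization routines above) rely on:
 * bit 7 = compression flag, bit 6 = infinity flag, bit 5 = sign of y
 * (only meaningful for compressed finite points). The helper below is
 * illustrative only and is not part of the package.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t header_byte(int compressed, int is_infinity, int y_sign) {
    uint8_t h = (uint8_t)(compressed << 7);            /* as in (G2_SERIALIZATION << 7) */
    if (is_infinity)      h |= 1 << 6;                 /* infinity bit, low bits stay 0 */
    else if (compressed)  h |= (uint8_t)(y_sign << 5); /* sign-of-y bit */
    return h;
}

int main(void) {
    /* compressed finite point with sign(y) = 1 -> 0xA0 */
    uint8_t h = header_byte(1, 0, 1);
    assert((h >> 7) == 1);        /* compression flag */
    assert(((h >> 6) & 1) == 0);  /* not infinity */
    assert(((h >> 5) & 1) == 1);  /* sign of y */
    /* compressed point at infinity -> 0xC0, all lower bits cleared */
    assert(header_byte(1, 1, 0) == 0xC0);
    printf("header byte layout checks passed\n");
    return 0;
}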
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a6e88baa205..a6688c5871d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -82,6 +82,7 @@ typedef struct prec_ { // TODO: to delete when Relic is removed bn_st* Fr_blst_to_relic(const Fr* x); +ep2_st* E2_blst_to_relic(const G2* x); int get_valid(); int get_invalid(); @@ -116,45 +117,49 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); void Fr_write_bytes(uint8_t *bin, const Fr* a); bool map_bytes_to_Fr(Fr*, const uint8_t*, int); -// Utility functions -ctx_t* relic_init_BLS12_381(); -prec_st* init_precomputed_data_BLS12_381(); -void precomputed_data_set(const prec_st* p); -void seed_relic(byte*, int); - -int ep_read_bin_compact(ep_t, const byte *, const int); -void ep_write_bin_compact(byte *, const ep_t, const int); -int G2_read_bytes(ep2_t, const byte *, const int); -void ep2_write_bin_compact(byte *, const ep2_t, const int); - - - +// Fp utilities +// E1 and G1 utilities +int ep_read_bin_compact(ep_t, const byte *, const int); +void ep_write_bin_compact(byte *, const ep_t, const int); void ep_mult_gen_bench(ep_t, const Fr*); void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); -void G2_mult_gen(ep2_t, const Fr*); -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); - void ep_sum_vector(ep_t, ep_st*, const int); -void ep2_sum_vector(ep2_t, ep2_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len); - -// membership checks int check_membership_G1(const ep_t); -int check_membership_G2(const ep2_t); - int simple_subgroup_check_G1(const ep_t); -int simple_subgroup_check_G2(const ep2_t); void ep_rand_G1(ep_t); void ep_rand_G1complement( ep_t); -void ep2_rand_G2(ep2_t); -void ep2_rand_G2complement( ep2_t); #if (MEMBERSHIP_CHECK_G1 == BOWE) int bowe_subgroup_check_G1(const ep_t); #endif +// E2 and G2 utilities +void E2_set_infty(G2* p); +bool_t E2_is_infty(const G2*); +bool_t E2_affine_on_curve(const G2*); +bool_t E2_is_equal(const G2* p1, const G2* p2); +void E2_to_affine(G2*, const G2*); +BLST_ERROR E2_read_bytes(G2*, const byte *, const int); +void E2_write_bytes(byte *, const G2*); +void G2_mult_gen(G2*, const Fr*); +void E2_add(G2* res, const G2* a, const G2* b); + +void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); +void ep2_sum_vector(ep2_t, ep2_st*, const int); +void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len); +int G2_check_membership(const G2*); +int simple_subgroup_check_G2(const ep2_t); +void ep2_rand_G2(ep2_t); +void ep2_rand_G2complement( ep2_t); + +// Utility functions +ctx_t* relic_init_BLS12_381(); +prec_st* init_precomputed_data_BLS12_381(); +void precomputed_data_set(const prec_st* p); +void seed_relic(byte*, int); + // utility testing function void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index e7dba41a8eb..e8b34cbb052 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -116,16 +116,17 @@ func TestSubgroupCheck(t *testing.T) { res = checkMembershipG1(&p) assert.Equal(t, res, int(invalid)) }) - - t.Run("G2", func(t *testing.T) { - var p pointG2 - randPointG2(&p) // point in G2 - res := checkMembershipG2(&p) - assert.Equal(t, res, int(valid)) - randPointG2Complement(&p) // point in E2\G2 - res = checkMembershipG2(&p) - assert.Equal(t, res, int(invalid)) - }) + /* + t.Run("G2", func(t *testing.T) 
{ + var p pointG2 + randPointG2(&p) // point in G2 + res := checkMembershipG2(&p) + assert.Equal(t, res, int(valid)) + randPointG2Complement(&p) // point in E2\G2 + res = checkMembershipG2(&p) + assert.Equal(t, res, int(invalid)) + }) + */ } // subgroup membership check bench @@ -140,14 +141,15 @@ func BenchmarkSubgroupCheck(b *testing.B) { } b.StopTimer() }) - - b.Run("G2", func(b *testing.B) { - var p pointG2 - randPointG2(&p) - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = checkMembershipG2(&p) // G2 - } - b.StopTimer() - }) + /* + b.Run("G2", func(b *testing.B) { + var p pointG2 + randPointG2(&p) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = checkMembershipG2(&p) // G2 + } + b.StopTimer() + }) + */ } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index a1d47c73f17..03fa21ca782 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -40,10 +40,10 @@ int check_membership_G1(const ep_t p){ // // membership check in G2 is using a scalar multiplication by the group order. // TODO: switch to the faster Bowe check -int check_membership_G2(const ep2_t p){ +int G2_check_membership(const G2* p){ #if MEMBERSHIP_CHECK // check p is on curve - if (!ep2_on_curve((ep2_st*)p)) + if (!E2_affine_on_curve(p)) // TODO: remove and assume inputs are on curve? return INVALID; // check p is in G2 #if MEMBERSHIP_CHECK_G2 == EXP_ORDER @@ -84,7 +84,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -97,9 +97,11 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i // hash to G1 map_to_G1(elemsG1[1], data, len); + ep2_st* pk_tmp = E2_blst_to_relic(pk); + // elemsG2[1] = pk ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk); + ep2_copy(elemsG2[1], (ep2_st*)pk_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -126,6 +128,7 @@ static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const i ep_free(elemsG1[1]); ep2_free(elemsG2[0]); ep2_free(elemsG2[1]); + free(pk_tmp); if (core_get()->code == RLC_OK) { if (res == RLC_EQ) return VALID; @@ -326,7 +329,7 @@ int bls_verifyPerDistinctKey(const byte* sig, // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications using the same key. -int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) { +int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { ep_t s; ep_new(s); @@ -343,6 +346,7 @@ int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) return bls_verify_ep(pk, s, data, len); } +/* // binary tree structure to be used by bls_batch verify. // Each node contains a signature and a public key, the signature (resp. the public key) @@ -350,15 +354,15 @@ int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) // The leaves contain the initial signatures and public keys. 
typedef struct st_node { ep_st* sig; - ep2_st* pk; + G2* pk; struct st_node* left; struct st_node* right; } node; -static node* new_node(const ep2_st* pk, const ep_st* sig){ +static node* new_node(const G2* pk, const ep_st* sig){ node* t = (node*) malloc(sizeof(node)); if (t) { - t->pk = (ep2_st*)pk; + t->pk = (G2*)pk; t->sig = (ep_st*)sig; t->right = t->left = NULL; } @@ -374,7 +378,6 @@ static void free_tree(node* root) { // the recursive build starts with the left side first // relic free if (root->sig) ep_free(root->sig); - if (root->pk) ep2_free(root->pk); // pointer free free(root->sig); free(root->pk); @@ -397,7 +400,7 @@ static node* build_tree(const int len, const ep2_st* pks, const ep_st* sigs) { int left_len = len - right_len; // create a new node with new points - ep2_st* new_pk = (ep2_st*)malloc(sizeof(ep2_st)); + G2* new_pk = (G2*)malloc(sizeof(G2)); if (!new_pk) goto error; ep_st* new_sig = (ep_st*)malloc(sizeof(ep_st)); if (!new_sig) goto error_sig; @@ -405,7 +408,6 @@ static node* build_tree(const int len, const ep2_st* pks, const ep_st* sigs) { node* t = new_node(new_pk, new_sig); if (!t) goto error_node; ep_new(t->sig); - ep2_new(t->pk); // build the tree in a top-down way t->left = build_tree(left_len, &pks[0], &sigs[0]); @@ -415,7 +417,7 @@ static node* build_tree(const int len, const ep2_st* pks, const ep_st* sigs) { if (!t->right) { free_tree(t); goto error; } // sum the children ep_add_jacob(t->sig, t->left->sig, t->right->sig); - ep2_add_projc(t->pk, t->left->pk, t->right->pk); + E2_add(t->pk, t->left->pk, t->right->pk); return t; error_node: @@ -522,3 +524,4 @@ void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, out_sigs: free(pks); } +*/ \ No newline at end of file diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 0e965bac88e..25bdf2020a7 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -21,13 +21,13 @@ #define SINGLE_PAIRING (DOUBLE_PAIRING^1) // Signature and public key membership check -#define MEMBERSHIP_CHECK 1 +#define MEMBERSHIP_CHECK 0 // TODO: switch to 1 and clean up memb check -// algorithm choice for the hashing to G1 -// both methods are similar implementations of the same optimzed SSWU +// algorithm choice for hashing to G1 +// both methods are similar implementations of the same optimized SSWU // but offer different timings. #define RELIC_SSWU 1 // relic library implementation -#define LOCAL_SSWU 2 // local implementation +#define LOCAL_SSWU 2 // local implementation #define hashToPoint LOCAL_SSWU // bls core (functions in bls_core.c) @@ -36,7 +36,7 @@ int get_pk_len(); int get_sk_len(); void bls_sign(byte*, const Fr*, const byte*, const int); -int bls_verify(const ep2_t, const byte*, const byte*, const int); +int bls_verify(const G2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, const uint32_t*, const ep2_st*); int bls_verifyPerDistinctKey(const byte*, diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index b4fa5918ef7..cf293726112 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -3,6 +3,7 @@ package crypto +/* import ( "errors" "fmt" @@ -13,7 +14,7 @@ import ( "github.com/onflow/flow-go/crypto/hash" _ "github.com/onflow/flow-go/crypto/hash" -) +)*/ // BLS multi-signature using BLS12-381 curve // ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381) @@ -43,6 +44,7 @@ import "C" // used for signatures. 
var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) +/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. // // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -193,13 +195,13 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } var sum pointG2 - C.ep2_sum_vector((*C.ep2_st)(&sum), (*C.ep2_st)(&points[0]), + C.ep2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil } - +*/ // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). // TODO: return a constant key instead of a newly allocated one @@ -207,11 +209,13 @@ func IdentityBLSPublicKey() PublicKey { identity := *newPubKeyBLSBLS12381(nil) // set the point to infinity - C.ep2_set_infty((*C.ep2_st)(&identity.point)) + C.E2_set_infty((*C.G2)(&identity.point)) identity.isIdentity = true return &identity } +/* + // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -248,8 +252,8 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, } var resultPoint pointG2 - C.ep2_subtract_vector((*C.ep2_st)(&resultPoint), (*C.ep2_st)(&aggPKBLS.point), - (*C.ep2_st)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) + C.ep2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), + (*C.G2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) resultKey := newPubKeyBLSBLS12381(&resultPoint) return resultKey, nil @@ -403,7 +407,7 @@ func VerifyBLSSignatureManyMessages( (*C.uchar)(&flatDistinctHashes[0]), (*C.uint32_t)(&lenHashes[0]), (*C.uint32_t)(&pkPerHash[0]), - (*C.ep2_st)(&allPks[0]), + (*C.G2)(&allPks[0]), ) } else { @@ -425,7 +429,7 @@ func VerifyBLSSignatureManyMessages( verif = C.bls_verifyPerDistinctKey( (*C.uchar)(&s[0]), (C.int)(len(mapPerPk)), - (*C.ep2_st)(&distinctPks[0]), + (*C.G2)(&distinctPks[0]), (*C.uint32_t)(&hashPerPk[0]), (*C.uchar)(&flatHashes[0]), (*C.uint32_t)(&lenHashes[0])) @@ -521,7 +525,7 @@ func BatchVerifyBLSSignaturesOneMessage( C.bls_batchVerify( (C.int)(len(verifInt)), (*C.uchar)(&verifInt[0]), - (*C.ep2_st)(&pkPoints[0]), + (*C.G2)(&pkPoints[0]), (*C.uchar)(&flatSigs[0]), (*C.uchar)(&h[0]), (C.int)(len(h)), @@ -570,3 +574,4 @@ var invalidSignatureError = errors.New("input signature does not deserialize to func IsInvalidSignatureError(err error) bool { return errors.Is(err, invalidSignatureError) } +*/ diff --git a/crypto/bls_test.go b/crypto/bls_test.go index a9672b8eeb7..7a93dd04998 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -6,10 +6,10 @@ package crypto import ( "crypto/rand" "encoding/hex" - "fmt" + _ "fmt" mrand "math/rand" "testing" - "time" + _ "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -18,7 +18,7 @@ import ( ) // TestBLSMainMethods is a sanity check of main signature scheme methods (keyGen, sign, verify) -func TestBLSMainMethods(t *testing.T) { +/*func TestBLSMainMethods(t *testing.T) { // test the key generation seed lengths testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) // test the consistency with different inputs @@ -63,7 +63,7 @@ func BenchmarkBLSBLS12381Sign(b *testing.B) { func BenchmarkBLSBLS12381Verify(b *testing.B) { halg := NewExpandMsgXOFKMAC128("bench tag") benchVerify(b, BLSBLS12381, halg) -} +}*/ // 
utility function to generate a random BLS private key func randomSK(t *testing.T, seed []byte) PrivateKey { @@ -122,14 +122,14 @@ func TestBLSBLS12381Hasher(t *testing.T) { h := internalExpandMsgXOFKMAC128(blsSigCipherSuite) assert.NotNil(t, h) }) - - t.Run("constants sanity check", func(t *testing.T) { - // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve - // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long - assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) - assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) - }) - + /* + t.Run("constants sanity check", func(t *testing.T) { + // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve + // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long + assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) + assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) + }) + */ t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher @@ -214,6 +214,7 @@ func TestBLSUtils(t *testing.T) { testKeySize(t, sk, PrKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) } +/* // BLS Proof of Possession test func TestBLSPOP(t *testing.T) { r := time.Now().UnixNano() @@ -1111,3 +1112,4 @@ func TestBLSIdentity(t *testing.T) { assert.False(t, valid) }) } +*/ diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index e6c21004193..094f4ebc692 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -3,6 +3,7 @@ package crypto +/* // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" @@ -581,7 +582,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, for i := index(1); int(i) <= size; i++ { C.Fr_polynomialImage( (*C.Fr)(&x[i-1]), - (*C.ep2_st)(&y[i-1]), + (*C.G2)(&y[i-1]), (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(i), ) @@ -603,3 +604,4 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil } +*/ diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 0d7f7204a79..04fe28d4db4 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -3,6 +3,7 @@ package crypto +/* import ( "crypto/rand" "fmt" @@ -647,4 +648,4 @@ func BenchmarkSignatureReconstruction(b *testing.B) { require.NoError(b, err) } b.StopTimer() -} +}*/ diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 0ee8e99ddb2..d33ec372be6 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -7,7 +7,9 @@ // eventually this file would replace blst.h #include "point.h" +#include "fields.h" #include "consts.h" +#include "errors.h" #include "bls12381_utils.h" // types used by the Flow crypto library that are imported from BLST @@ -55,16 +57,9 @@ typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ # define DEFNULL #endif -typedef enum { - BLST_SUCCESS = 0, - BLST_BAD_ENCODING, - BLST_POINT_NOT_ON_CURVE, - BLST_POINT_NOT_IN_GROUP, - BLST_AGGR_TYPE_MISMATCH, - BLST_VERIFY_FAIL, - BLST_PK_IS_INFINITY, - BLST_BAD_SCALAR, -} BLST_ERROR; +// TODO: add sanity checks that BLST_PK_IS_INFINITY is indeed the last +// enum value (eventually submit a fix to BLST) +#define BLST_BAD_SCALAR ((BLST_PK_IS_INFINITY)+1) // field elements F_r // where `r` is the order of G1/G2. @@ -72,7 +67,8 @@ typedef enum { // are represented as a little endian vector of limbs. 
// `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). // `Fr` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {limb_t limbs[Fr_LIMBS];} Fr; +#define R_BITS +typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS // field elements F_p // F_p elements are represented as big numbers reduced modulo `p`. Big numbers @@ -86,19 +82,23 @@ typedef vec384 Fp; // where x, y, x are elements of F_p (type `Fp`). // `G1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) // `G1` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp x,y,z} G1; +typedef struct {Fp x,y,z;} G1; // field elements F_p^2 // F_p^2 elements are represented as a vector of two F_p elements. // `Fp2` is equivalent to type `vec384x` (used internally by BLST for F_p^2 elements). // `Fp2` does not need to be exported to cgo. -typedef vec384x Fp2; +typedef vec384x Fp2; +// helpers to get "real" and "imaginary" Fp elements from Fp2 pointers +#define real(p) ((*(p))[0]) +#define imag(p) ((*(p))[1]) + // Subroup G2 in E2 // G2 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). // `G2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E1 elements) // `G2` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp2 x,y,z} G2; +typedef struct {Fp2 x,y,z;} G2; #endif diff --git a/crypto/blst_src.c b/crypto/blst_src/blst_src.c similarity index 99% rename from crypto/blst_src.c rename to crypto/blst_src/blst_src.c index 89388b703fe..4b0732e06e4 100644 --- a/crypto/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -17,3 +17,4 @@ #include "consts.c" #include "vect.c" #include "exports.c" + diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c deleted file mode 100644 index df11e3dae73..00000000000 --- a/crypto/blst_src/client_min_pk.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -/*#include "keygen.c" -#include "e2.c" -#include "hash_to_field.c" -#include "map_to_g2.c" -#include "e1.c" -#include "exp.c" -#include "sqrt.c" -#include "recip.c" -#include "consts.c" -#include "vect.c" -#include "exports.c"*/ diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c deleted file mode 100644 index fffbd5ad52d..00000000000 --- a/crypto/blst_src/client_min_sig.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -/*#include "keygen.c" -#include "e1.c" -#include "hash_to_field.c" -#include "map_to_g1.c" -#include "e2.c" -#include "exp.c" -#include "sqrt.c" -#include "recip.c" -#include "consts.c" -#include "vect.c" -#include "exports.c"*/ diff --git a/crypto/dkg.go b/crypto/dkg.go index 1cdf87a128e..1254db615f3 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -1,5 +1,7 @@ package crypto +/* + import ( "errors" "fmt" @@ -235,3 +237,4 @@ type DKGProcessor interface { // log describes the misbehavior. 
FlagMisbehavior(participant int, log string) } +*/ diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 5fdd6db7c79..9bf9dd8b2fc 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -1,7 +1,7 @@ // +build relic #include "dkg_include.h" - +/* #define N_max 250 #define N_bits_max 8 // log(250) @@ -69,12 +69,12 @@ void G2_polynomialImages(ep2_st *y, const int len_y, const ep2_st* A, const int } // export an array of ep2_st into an array of bytes -// the length matching is supposed to be checked +// the array must be of length (len * G2_SER_BYTES) void ep2_vector_write_bin(byte* out, const ep2_st* A, const int len) { const int size = (G2_BYTES/(G2_SERIALIZATION+1)); byte* p = out; for (int i=0; i Date: Thu, 13 Apr 2023 17:50:31 -0600 Subject: [PATCH 027/200] G2 type working for BLS --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 69 +++++-- crypto/bls12381_utils.go | 2 +- crypto/bls12381_utils.h | 3 + crypto/bls_core.c | 4 +- crypto/bls_test.go | 4 +- crypto/blst_include.h | 4 +- crypto/sign_test_utils.go | 396 +++++++++++++++++++------------------- 8 files changed, 269 insertions(+), 215 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 66f4c809e85..65113f873ba 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -348,7 +348,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err var pk pubKeyBLSBLS12381 err := readPointG2(&pk.point, publicKeyBytes) if err != nil { - return nil, fmt.Errorf("decode public key failed %w", err) + return nil, fmt.Errorf("decode public key failed: %w", err) } // membership check in G2 diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b66be0932a2..3b643933366 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -493,7 +493,7 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static limb_t Fp_get_sign(const fp_t y) { +static byte Fp_get_sign(const fp_t y) { return sgn0_pty_mont_384(y, BLS12_381_P, p0); } @@ -530,15 +530,19 @@ void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // checks if `a` is a quadratic residue in Fp^2. If yes, it computes // the square root in `res`. +// +// The boolean output is valid whether `a` is in Montgomery form or not, +// since montgomery constant `R` is a quadratic residue. +// However, the square root is valid only if `a` is in montgomery form. static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { return sqrt_fp2((vec384*)res, (vec384*)a); } // returns the sign of y. // sign(y_0) if y_1 = 0, else sign(y_1) -// y coordinates are in montgommery form -static limb_t Fp2_get_sign(Fp2* y) { - return sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0); +// y coordinates must be in montgomery form +static byte Fp2_get_sign(Fp2* y) { + return (sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0)>>1) & 1; } // reads an Fp^2 element in `a`. 
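To see how the Fp^2 helpers above fit together, here is a small decompression sketch (not part of the patch) that mirrors the y-recovery performed by E2_read_bytes further down. It assumes `x` is already in Montgomery form and `y_sign` is the sign bit read from the serialization header; B_E2 is the curve constant b of E2 in Montgomery form, as used elsewhere in this patch.

    /* sketch only: recover y from x for a compressed E2 point */
    static BLST_ERROR recover_y_sketch(Fp2* y, const Fp2* x, byte y_sign) {
        Fp2_squ_montg(y, x);          /* y = x^2                                 */
        Fp2_mul_montg(y, y, x);       /* y = x^3                                 */
        Fp2_add(y, y, &B_E2);         /* y = x^3 + b                             */
        if (!Fp2_sqrt(y, y))          /* no root: x is not the x of a curve point */
            return BLST_POINT_NOT_ON_CURVE;
        if (Fp2_get_sign(y) != y_sign)
            Fp2_neg(y, y);            /* pick the root matching the serialized sign */
        return BLST_SUCCESS;
    }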
@@ -595,7 +599,7 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { } // check if the point is infinity - int is_infinity = bin[0] & 0x40; + int is_infinity = bin[0] & (1<<6); if (is_infinity) { // check if the remaining bits are cleared if (bin[0] & 0x3F) { @@ -645,6 +649,15 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { return RLC_ERR; } + +// TODO: delete aftet deleting ep_write_bin_compact +static int fp_get_sign(const fp_t y) { + bn_t bn_y; + bn_new(bn_y); + fp_prime_back(bn_y, y); + return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; +} + // ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. // len is the allocated size of the buffer bin. // The serialization is following: @@ -660,7 +673,7 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { if (ep_is_infty(a)) { // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | 0x40; + bin[0] = (G1_SERIALIZATION << 7) | (1<<6); memset(bin+1, 0, G1_size-1); return; } @@ -673,7 +686,7 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { fp_write_bin(bin, Fp_BYTES, t->x); if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(t->y) << 5); + bin[0] |= (fp_get_sign(t->y) << 5); } else { fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); } @@ -881,11 +894,11 @@ BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { Fp2* a_y = &(a->y); Fp2_squ_montg(a_y, a_x); Fp2_mul_montg(a_y, a_y, a_x); - Fp2_add(a_y, a_y, &B_E2); - if (!Fp2_sqrt(a_y, a_y)) // if (y^2 = x^3+b) has no solution in y + Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form + if (!Fp2_sqrt(a_y, a_y)) // check whether x^3+b is a quadratic residue return BLST_POINT_NOT_ON_CURVE; - // resulting (x,y) is guaranteed to be on curve + // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp2_get_sign(a_y) != y_sign) { Fp2_neg(a_y, a_y); // flip y sign if needed } @@ -900,13 +913,13 @@ BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { void E2_write_bytes(byte *bin, const G2* a) { if (E2_is_infty(a)) { // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | 0x40; + bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); memset(bin+1, 0, G2_SER_BYTES-1); return; } - G2 tmp; E2_to_affine(&tmp, a); + Fp2* t_x = &(tmp.x); Fp_from_montg(&real(t_x), &real(t_x)); Fp_from_montg(&imag(t_x), &imag(t_x)); @@ -916,6 +929,8 @@ void E2_write_bytes(byte *bin, const G2* a) { if (G2_SERIALIZATION == COMPRESSED) { bin[0] |= (Fp2_get_sign(t_y) << 5); } else { + Fp_from_montg(&real(t_y), &real(t_y)); + Fp_from_montg(&imag(t_y), &imag(t_y)); Fp2_write_bytes(bin + Fp2_BYTES, t_y); } @@ -1275,6 +1290,36 @@ void Fr_print_(char* s, Fr* a) { printf("%16llx", *(--p)); printf("\n"); } + +void Fp_print_(char* s, Fp* a) { + printf("[%s]:\n", s); + limb_t* p = (limb_t*)(a) + Fp_LIMBS; + for (int i=0; ix)); + Fp2_print_(".y", &(a->y)); + Fp2_print_(".z", &(a->z)); +} void fp_print_(char* s, fp_st a) { diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 9d59eb8d7d4..2297e434c2f 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -119,7 +119,7 @@ func (x *scalar) isZero() bool { // Comparison to point at infinity in G2. 
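For readability, the header-byte bit manipulations used by ep_read_bin_compact, ep_write_bin_compact, E2_read_bytes and E2_write_bytes above all follow the zcash serialization convention. The names below are illustrative only (the patch keeps the raw shifts):

    /* first serialized byte, most significant bits */
    #define SER_COMPRESSION_BIT (1 << 7)  /* 1 if the encoding is compressed           */
    #define SER_INFINITY_BIT    (1 << 6)  /* 1 if the point is the point at infinity   */
    #define SER_SIGN_BIT        (1 << 5)  /* sign of y, compressed non-infinity points */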
func (p *pointG2) isInfinity() bool { - return C.E2_is_infty((*C.G2)(p)) != 10 + return C.E2_is_infty((*C.G2)(p)) != 0 } // returns a random element of Fr in input pointer diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a6688c5871d..521941c7fee 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -166,6 +166,9 @@ void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); // Debugging related functions void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); +void Fp_print_(char*, Fp*); +void Fp2_print_(char*, const Fp2*); +void E2_print_(char*, const G2*); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 03fa21ca782..cfabca52719 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -101,7 +101,7 @@ static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int // elemsG2[1] = pk ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk_tmp); + ep2_copy(elemsG2[1], pk_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -336,11 +336,13 @@ int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { // deserialize the signature into a curve point int read_ret = ep_read_bin_compact(s, sig, SIGNATURE_LEN); if (read_ret != RLC_OK) { + printf("HHH1\n"); return read_ret; } // check s is in G1 if (check_membership_G1(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 + printf("HHH2\n"); return INVALID; } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 7a93dd04998..2965326fb66 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -18,7 +18,7 @@ import ( ) // TestBLSMainMethods is a sanity check of main signature scheme methods (keyGen, sign, verify) -/*func TestBLSMainMethods(t *testing.T) { +func TestBLSMainMethods(t *testing.T) { // test the key generation seed lengths testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) // test the consistency with different inputs @@ -63,7 +63,7 @@ func BenchmarkBLSBLS12381Sign(b *testing.B) { func BenchmarkBLSBLS12381Verify(b *testing.B) { halg := NewExpandMsgXOFKMAC128("bench tag") benchVerify(b, BLSBLS12381, halg) -}*/ +} // utility function to generate a random BLS private key func randomSK(t *testing.T, seed []byte) PrivateKey { diff --git a/crypto/blst_include.h b/crypto/blst_include.h index d33ec372be6..c480a68d27e 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,11 +6,11 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h +#include "bls12381_utils.h" #include "point.h" #include "fields.h" #include "consts.h" #include "errors.h" -#include "bls12381_utils.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types @@ -67,7 +67,7 @@ typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ // are represented as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). // `Fr` is defined as a struct to be exportable through cgo to the Go layer. 
-#define R_BITS +#define R_BITS 255 typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS // field elements F_p diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index e9198a0c7b5..8a81e5bb45a 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -47,62 +47,62 @@ func TestHasherErrors(t *testing.T) { // tests sign and verify are consistent for multiple generated keys and messages func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { - t.Logf("Testing Generation/Signature/Verification for %s", salg) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - seed := make([]byte, seedMinLength) - input := make([]byte, 100) - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) - - loops := 50 - for j := 0; j < loops; j++ { - n, err := mrand.Read(seed) - require.Equal(t, n, seedMinLength) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - _, err = mrand.Read(input) - require.NoError(t, err) - s, err := sk.Sign(input, halg) - require.NoError(t, err) - pk := sk.PublicKey() + t.Run(fmt.Sprintf("Testing Generation/Signature/Verification for %s", salg), func(t *testing.T) { + seed := make([]byte, KeyGenSeedMinLen) + input := make([]byte, 100) + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) - // test a valid signature - result, err := pk.Verify(s, input, halg) - require.NoError(t, err) - assert.True(t, result, fmt.Sprintf( - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + loops := 50 + for j := 0; j < loops; j++ { + n, err := mrand.Read(seed) + require.Equal(t, n, KeyGenSeedMinLen) + require.NoError(t, err) + sk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + _, err = mrand.Read(input) + require.NoError(t, err) + s, err := sk.Sign(input, halg) + require.NoError(t, err) + pk := sk.PublicKey() - // test with a different message - input[0] ^= 1 - result, err = pk.Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) - input[0] ^= 1 + // test a valid signature + result, err := pk.Verify(s, input, halg) + require.NoError(t, err) + assert.True(t, result, fmt.Sprintf( + "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) - // test with a valid but different key - seed[0] ^= 1 - wrongSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - result, err = wrongSk.PublicKey().Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + // test with a different message + input[0] ^= 1 + result, err = pk.Verify(s, input, halg) + require.NoError(t, err) + assert.False(t, result, fmt.Sprintf( + "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + input[0] ^= 1 + + // test with a valid but different key + seed[0] ^= 1 + wrongSk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + result, err = wrongSk.PublicKey().Verify(s, input, halg) + require.NoError(t, err) + assert.False(t, result, fmt.Sprintf( + "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + + // test a wrong signature length + invalidLen := mrand.Intn(2 * len(s)) // try random 
invalid lengths + if invalidLen == len(s) { // map to an invalid length + invalidLen = 0 + } + invalidSig := make([]byte, invalidLen) + result, err = pk.Verify(invalidSig, input, halg) + require.NoError(t, err) + assert.False(t, result, fmt.Sprintf( + "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - // test a wrong signature length - invalidLen := mrand.Intn(2 * len(s)) // try random invalid lengths - if invalidLen == len(s) { // map to an invalid length - invalidLen = 0 } - invalidSig := make([]byte, invalidLen) - result, err = pk.Verify(invalidSig, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - } + }) } // tests the key generation constraints with regards to the input seed, mainly @@ -154,167 +154,171 @@ func testKeyGenSeed(t *testing.T, salg SigningAlgorithm, minLen int, maxLen int) } func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Logf("Testing encode/decode for %s", salg) - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - - t.Run("happy path tests", func(t *testing.T) { - loops := 50 - for j := 0; j < loops; j++ { - // generate a private key - seed := make([]byte, seedMinLength) - read, err := mrand.Read(seed) - require.Equal(t, read, seedMinLength) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - assert.Nil(t, err, "the key generation failed") - seed[0] ^= 1 // alter the seed to get a new private key - distinctSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - - // check private key encoding - skBytes := sk.Encode() - skCheck, err := DecodePrivateKey(salg, skBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, sk.Equals(skCheck), "key equality check failed") - skCheckBytes := skCheck.Encode() - assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") - distinctSkBytes := distinctSk.Encode() - assert.NotEqual(t, skBytes, distinctSkBytes, "keys should be different") - - // check public key encoding - pk := sk.PublicKey() - pkBytes := pk.Encode() - pkCheck, err := DecodePublicKey(salg, pkBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkCheck), "key equality check failed") - pkCheckBytes := pkCheck.Encode() - assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") - distinctPkBytes := distinctSk.PublicKey().Encode() - assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") - - // same for the compressed encoding - pkComprBytes := pk.EncodeCompressed() - pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") - pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") - distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") - } - }) - - // test invalid private keys (equal to the curve group order) - t.Run("private keys equal to the group order", func(t *testing.T) { - groupOrder := make(map[SigningAlgorithm][]byte) - groupOrder[ECDSAP256] = []byte{255, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, - 255, 255, 255, 255, 255, 188, 230, 250, 173, 167, - 
23, 158, 132, 243, 185, 202, 194, 252, 99, 37, 81} + t.Run(fmt.Sprintf("Testing encode/decode for %s", salg), func(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) - groupOrder[ECDSASecp256k1] = []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 254, 186, 174, 220, 230, - 175, 72, 160, 59, 191, 210, 94, 140, 208, 54, 65, 65} + t.Run("happy path tests", func(t *testing.T) { + loops := 50 + for j := 0; j < loops; j++ { + // generate a private key + seed := make([]byte, KeyGenSeedMinLen) + read, err := mrand.Read(seed) + require.Equal(t, read, KeyGenSeedMinLen) + require.NoError(t, err) + sk, err := GeneratePrivateKey(salg, seed) + assert.Nil(t, err, "the key generation failed") + seed[0] ^= 1 // alter the seed to get a new private key + distinctSk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + + // check private key encoding + skBytes := sk.Encode() + skCheck, err := DecodePrivateKey(salg, skBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, sk.Equals(skCheck), "key equality check failed") + skCheckBytes := skCheck.Encode() + assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") + distinctSkBytes := distinctSk.Encode() + assert.NotEqual(t, skBytes, distinctSkBytes, "keys should be different") + + // check public key encoding + pk := sk.PublicKey() + pkBytes := pk.Encode() + pkCheck, err := DecodePublicKey(salg, pkBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, pk.Equals(pkCheck), "key equality check failed") + pkCheckBytes := pkCheck.Encode() + assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") + distinctPkBytes := distinctSk.PublicKey().Encode() + assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") + + // same for the compressed encoding + pkComprBytes := pk.EncodeCompressed() + pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") + pkCheckComprBytes := pkComprCheck.EncodeCompressed() + assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") + distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() + assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + } + }) + + // test invalid private keys (equal to the curve group order) + + t.Run("private keys equal to the group order", func(t *testing.T) { + groupOrder := make(map[SigningAlgorithm][]byte) + groupOrder[ECDSAP256] = []byte{255, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, + 255, 255, 255, 255, 255, 188, 230, 250, 173, 167, + 23, 158, 132, 243, 185, 202, 194, 252, 99, 37, 81} + + groupOrder[ECDSASecp256k1] = []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 254, 186, 174, 220, 230, + 175, 72, 160, 59, 191, 210, 94, 140, 208, 54, 65, 65} + + groupOrder[BLSBLS12381] = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, + 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 0xFF, 0xFE, + 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} + + sk, err := DecodePrivateKey(salg, groupOrder[salg]) + require.Error(t, err, "the key decoding should fail - private key value is too large") + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) + }) - groupOrder[BLSBLS12381] = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, - 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 
0xFF, 0xFE, - 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} + // test invalid private and public keys (invalid length) - sk, err := DecodePrivateKey(salg, groupOrder[salg]) - require.Error(t, err, "the key decoding should fail - private key value is too large") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - }) + t.Run("invalid key length", func(t *testing.T) { + // private key + skLens := make(map[SigningAlgorithm]int) + skLens[ECDSAP256] = PrKeyLenECDSAP256 + skLens[ECDSASecp256k1] = PrKeyLenECDSASecp256k1 + skLens[BLSBLS12381] = 32 - // test invalid private and public keys (invalid length) - t.Run("invalid key length", func(t *testing.T) { - // private key - skLens := make(map[SigningAlgorithm]int) - skLens[ECDSAP256] = PrKeyLenECDSAP256 - skLens[ECDSASecp256k1] = PrKeyLenECDSASecp256k1 - skLens[BLSBLS12381] = 32 - - bytes := make([]byte, skLens[salg]+1) - sk, err := DecodePrivateKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) + bytes := make([]byte, skLens[salg]+1) + sk, err := DecodePrivateKey(salg, bytes) + require.Error(t, err) + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) - // public key - pkLens := make(map[SigningAlgorithm]int) - pkLens[ECDSAP256] = PubKeyLenECDSAP256 - pkLens[ECDSASecp256k1] = PubKeyLenECDSASecp256k1 - pkLens[BLSBLS12381] = 96 + // public key + pkLens := make(map[SigningAlgorithm]int) + pkLens[ECDSAP256] = PubKeyLenECDSAP256 + pkLens[ECDSASecp256k1] = PubKeyLenECDSASecp256k1 + pkLens[BLSBLS12381] = 96 - bytes = make([]byte, pkLens[salg]+1) - pk, err := DecodePublicKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) + bytes = make([]byte, pkLens[salg]+1) + pk, err := DecodePublicKey(salg, bytes) + require.Error(t, err) + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, pk) + }) }) } func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorithm) { - t.Logf("Testing Equals for %s", salg) - r := time.Now().UnixNano() - mrand.Seed(r) - t.Logf("math rand seed is %d", r) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - - // generate a key pair - seed := make([]byte, seedMinLength) - n, err := mrand.Read(seed) - require.Equal(t, n, seedMinLength) - require.NoError(t, err) - - // first pair - sk1, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk1 := sk1.PublicKey() - - // second pair without changing the seed - sk2, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk2 := sk2.PublicKey() - - // unrelated algo pair - sk3, err := GeneratePrivateKey(otherSigAlgo, seed) - require.NoError(t, err) - pk3 := sk3.PublicKey() - - // fourth pair with same algo but a different seed - seed[0] ^= 1 - sk4, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk4 := sk4.PublicKey() - - // tests - assert.True(t, sk1.Equals(sk2), "key equality should return true") - assert.True(t, pk1.Equals(pk2), "key equality should return true") - assert.False(t, sk1.Equals(sk3), "key equality should return false") - assert.False(t, pk1.Equals(pk3), "key equality should return false") - assert.False(t, sk1.Equals(sk4), "key equality should return false") - assert.False(t, pk1.Equals(pk4), "key equality should return false") + t.Run(fmt.Sprintf("Testing Equals for %s", salg), func(t *testing.T) { + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) + // make sure the length is larger 
than minimum lengths of all the signaure algos + seedMinLength := 48 + + // generate a key pair + seed := make([]byte, seedMinLength) + n, err := mrand.Read(seed) + require.Equal(t, n, seedMinLength) + require.NoError(t, err) + + // first pair + sk1, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk1 := sk1.PublicKey() + + // second pair without changing the seed + sk2, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk2 := sk2.PublicKey() + + // unrelated algo pair + sk3, err := GeneratePrivateKey(otherSigAlgo, seed) + require.NoError(t, err) + pk3 := sk3.PublicKey() + + // fourth pair with same algo but a different seed + seed[0] ^= 1 + sk4, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk4 := sk4.PublicKey() + + // tests + assert.True(t, sk1.Equals(sk2), "key equality should return true") + assert.True(t, pk1.Equals(pk2), "key equality should return true") + assert.False(t, sk1.Equals(sk3), "key equality should return false") + assert.False(t, pk1.Equals(pk3), "key equality should return false") + assert.False(t, sk1.Equals(sk4), "key equality should return false") + assert.False(t, pk1.Equals(pk4), "key equality should return false") + }) } func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { - t.Logf("Testing key.Algorithm for %s", salg) - alg := sk.Algorithm() - assert.Equal(t, alg, salg) - alg = sk.PublicKey().Algorithm() - assert.Equal(t, alg, salg) + t.Run(fmt.Sprintf("Testing key.Algorithm for %s", salg), func(t *testing.T) { + alg := sk.Algorithm() + assert.Equal(t, alg, salg) + alg = sk.PublicKey().Algorithm() + assert.Equal(t, alg, salg) + }) } func testKeySize(t *testing.T, sk PrivateKey, skLen int, pkLen int) { - t.Logf("Testing key.Size for %s", sk.Algorithm()) - size := sk.Size() - assert.Equal(t, size, skLen) - size = sk.PublicKey().Size() - assert.Equal(t, size, pkLen) + t.Run(fmt.Sprintf("Testing key.Size for %s", sk.Algorithm()), func(t *testing.T) { + size := sk.Size() + assert.Equal(t, size, skLen) + size = sk.PublicKey().Size() + assert.Equal(t, size, pkLen) + }) } func benchVerify(b *testing.B, algo SigningAlgorithm, halg hash.Hasher) { From 25b7be676700f50b82de352c7e201203f67f76b5 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 14 Apr 2023 12:51:46 -0600 Subject: [PATCH 028/200] integrate G2 in BLS multi-sig --- crypto/bls12381_utils.c | 60 +++++++++++++++++++----------- crypto/bls12381_utils.h | 8 +++- crypto/bls12381_utils_test.go | 1 - crypto/bls_core.c | 69 ++++++++++++++++++++--------------- crypto/bls_include.h | 6 +-- crypto/bls_multisig.go | 13 ++----- crypto/bls_test.go | 26 ++++++------- crypto/build_dependency.sh | 2 +- crypto/dkg_jointfeldman.go | 4 +- crypto/relic_build.sh | 4 +- 10 files changed, 108 insertions(+), 85 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 3b643933366..423a0a890b0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -142,6 +142,16 @@ bn_st* Fr_blst_to_relic(const Fr* x) { return out; } +// TODO: temp utility function to delete +Fr* Fr_relic_to_blst(const bn_st* x){ + Fr* out = (Fr*)malloc(sizeof(Fr)); + byte* data = (byte*)malloc(Fr_BYTES); + bn_write_bin(data, Fr_BYTES, x); + Fr_read_bytes(out, data, Fr_BYTES); + free(data); + return out; +} + // returns true if a == 0 and false otherwise bool_t Fr_is_zero(const Fr* a) { return bytes_are_zero((const byte*)a, Fr_BYTES); @@ -159,7 +169,7 @@ void Fr_set_limb(Fr* a, const limb_t l){ } void Fr_copy(Fr* res, const Fr* a) { - 
vec_copy((byte*)res, (byte*)a, Fr_BYTES); + vec_copy((byte*)res, (byte*)a, sizeof(Fr)); } // sets `a` to 0 @@ -383,7 +393,7 @@ void Fp_set_limb(Fp* a, const limb_t l){ } void Fp_copy(Fp* res, const Fp* a) { - vec_copy((byte*)res, (byte*)a, Fp_BYTES); + vec_copy((byte*)res, (byte*)a, sizeof(Fr)); } static void Fp_add(Fp *res, const Fp *a, const Fp *b) { @@ -960,30 +970,41 @@ bool_t E2_is_equal(const G2* p1, const G2* p2) { return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); } +// res = p +void E2_copy(G2* res, const G2* p) { + vec_copy(res, p, sizeof(G2)); +} + // converts an E2 point from Jacobian into affine coordinates (z=1) void E2_to_affine(G2* res, const G2* p) { // minor optimization in case coordinates are already affine if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { - vec_copy(res, p, G2_BYTES); + E2_copy(res, p); return; } // convert from Jacobian POINTonE2_from_Jacobian((POINTonE2*)res, (const POINTonE2*)p); } +// generic point addition that must handle doubling and points at infinity void E2_add(G2* res, const G2* a, const G2* b) { POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } -// Exponentiation of a generic point p in G2 -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep2_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); +// Point negation in place. +// no need for an api of the form E2_neg(G2* res, const G2* a) for now +static void E2_neg(G2* a) { + POINTonE2_cneg((POINTonE2*)a, 1); +} + +// Exponentiation of a generic point `a` in E2, res = expo.a +void E2_mult(G2* res, const G2* a, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_sign((POINTonE2*)res, (POINTonE2*)a, tmp); } -// Exponentiation of generator g2 in G2 +// Exponentiation of generator g2 of G2, res = expo.g2 void G2_mult_gen(G2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); @@ -991,14 +1012,11 @@ void G2_mult_gen(G2* res, const Fr* expo) { } // computes the sum of the G2 array elements y and writes the sum in jointy -void ep2_sum_vector(ep2_t jointy, ep2_st* y, const int len){ - ep2_set_infty(jointy); +void E2_sum_vector(G2* jointy, const G2* y, const int len){ + E2_set_infty(jointy); for (int i=0; ipk, root->sig, data, data_len); @@ -460,20 +463,22 @@ static void bls_batchVerify_tree(const node* root, const int len, byte* results, } // Batch verifies the validity of a multiple BLS signatures of the -// same message under multiple public keys. +// same message under multiple public keys. Each signature at index `i` is verified +// against the public key at index `i`. // // - membership checks of all signatures is verified upfront. -// - use random coefficients for signatures and public keys at the same index. +// - use random coefficients for signatures and public keys at the same index to prevent +// indices mixup. // - optimize the verification by verifying an aggregated signature against an aggregated // public key, and use a recursive verification to find invalid signatures. 
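A short note on why the per-index random coefficients matter (a restatement of the comment above, not part of the patch): for a common message with hash h, the batch check effectively tests

    e(sum_i r_i * sig_i, g2)  ==  e(h, sum_i r_i * pk_i)

with fresh non-zero r_i of at least 128 bits. Without the r_i, two signers could swap their individually valid signatures and the aggregated equation would still hold; with independent r_i such a mix-up passes only with negligible probability, which a later commit in this series exercises with the "valid signatures with incorrect indices" test.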
-void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, +void bls_batchVerify(const int sigs_len, byte* results, const G2* pks_input, const byte* sigs_bytes, const byte* data, const int data_len) { - + // initialize results to undefined memset(results, UNDEFINED, sigs_len); // build the arrays of G1 and G2 elements to verify - ep2_st* pks = (ep2_st*) malloc(sigs_len * sizeof(ep2_st)); + G2* pks = (G2*) malloc(sigs_len * sizeof(G2)); if (!pks) return; ep_st* sigs = (ep_st*) malloc(sigs_len * sizeof(ep_st)); if (!sigs) goto out_sigs; @@ -489,24 +494,30 @@ void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. + + // choose a random non-zero coefficient of at least 128 bits + // TODO: find a way to generate randoms + bn_rand(r, RLC_POS, SEC_BITS); + bn_add_dig(r, r, 1); + Fr* tmp = Fr_relic_to_blst(r); + // multiply public key by the random exponent + E2_mult(&pks[i], &pks_input[i], tmp); + int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if ( read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { - if (read_ret == UNDEFINED) // unexpected error case + if (read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { + if (read_ret == UNDEFINED) {// unexpected error case goto out; - // set signature as infinity and set result as invald - ep_set_infty(&sigs[i]); - ep2_copy(&pks[i], (ep2_st*) &pks_input[i]); - results[i] = INVALID; - // multiply signatures and public keys at the same index by random coefficients + }; + // set signature as infinity and set result as invalid + // this result won't be overwritten + ep_set_infty(&sigs[i]); + results[i] = INVALID; } else { - // random non-zero coefficient of a least 128 bits - bn_rand(r, RLC_POS, SEC_BITS); - bn_add_dig(r, r, 1); - ep_mul_lwnaf(&sigs[i], &sigs[i], r); - ep2_mul_lwnaf(&pks[i], (ep2_st*) &pks_input[i], r); + // multiply the signature by the same random exponent + ep_mul_lwnaf(&sigs[i], &sigs[i], r); } + free(tmp); } - // build a binary tree of aggreagtions node* root = build_tree(sigs_len, &pks[0], &sigs[0]); if (!root) goto out; @@ -515,15 +526,13 @@ void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, bls_batchVerify_tree(root, sigs_len, &results[0], data, data_len); // free the allocated tree free_tree(root); - + out: bn_free(r); for (int i=0; i < sigs_len; i++) { ep_free(sigs[i]); - ep2_free(pks[i]); } free(sigs); out_sigs: free(pks); } -*/ \ No newline at end of file diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 25bdf2020a7..32b9f506c8c 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -38,11 +38,11 @@ int get_sk_len(); void bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const G2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, - const uint32_t*, const ep2_st*); + const uint32_t*, const G2*); int bls_verifyPerDistinctKey(const byte*, - const int, const ep2_st*, const uint32_t*, + const int, const G2*, const uint32_t*, const byte*, const uint32_t*); -void bls_batchVerify(const int, byte*, const ep2_st*, +void bls_batchVerify(const int, byte*, const G2*, const byte*, const byte*, const int); #endif diff --git a/crypto/bls_multisig.go 
b/crypto/bls_multisig.go index cf293726112..e9139183c3f 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -3,7 +3,6 @@ package crypto -/* import ( "errors" "fmt" @@ -14,7 +13,7 @@ import ( "github.com/onflow/flow-go/crypto/hash" _ "github.com/onflow/flow-go/crypto/hash" -)*/ +) // BLS multi-signature using BLS12-381 curve // ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381) @@ -44,7 +43,6 @@ import "C" // used for signatures. var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite) -/* // BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key. // // The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used @@ -195,13 +193,13 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } var sum pointG2 - C.ep2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), + C.E2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) return sumKey, nil } -*/ + // IdentityBLSPublicKey returns an identity public key which corresponds to the point // at infinity in G2 (identity element of G2). // TODO: return a constant key instead of a newly allocated one @@ -214,8 +212,6 @@ func IdentityBLSPublicKey() PublicKey { return &identity } -/* - // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. // // The common use case assumes the aggregated public key was initially formed using @@ -252,7 +248,7 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, } var resultPoint pointG2 - C.ep2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), + C.E2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), (*C.G2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) resultKey := newPubKeyBLSBLS12381(&resultPoint) @@ -574,4 +570,3 @@ var invalidSignatureError = errors.New("input signature does not deserialize to func IsInvalidSignatureError(err error) bool { return errors.Is(err, invalidSignatureError) } -*/ diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 2965326fb66..ad29b088481 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -6,10 +6,10 @@ package crypto import ( "crypto/rand" "encoding/hex" - _ "fmt" + "fmt" mrand "math/rand" "testing" - _ "time" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -122,14 +122,14 @@ func TestBLSBLS12381Hasher(t *testing.T) { h := internalExpandMsgXOFKMAC128(blsSigCipherSuite) assert.NotNil(t, h) }) - /* - t.Run("constants sanity check", func(t *testing.T) { - // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve - // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long - assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) - assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) - }) - */ + + t.Run("constants sanity check", func(t *testing.T) { + // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve + // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long + assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) + assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) + }) + t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { data := []byte("random_data") // empty tag hasher @@ -214,7 +214,6 @@ func TestBLSUtils(t *testing.T) { testKeySize(t, sk, PrKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) } -/* // BLS Proof of Possession test func TestBLSPOP(t 
*testing.T) { r := time.Now().UnixNano() @@ -651,9 +650,9 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("one valid signature", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid[:1], + assert.Equal(t, expectedValid[:1], valid, "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + sigs[:1], sks[:1], input, valid) }) // pick a random number of invalid signatures @@ -1112,4 +1111,3 @@ func TestBLSIdentity(t *testing.T) { assert.False(t, valid) }) } -*/ diff --git a/crypto/build_dependency.sh b/crypto/build_dependency.sh index bd5d612e9cb..4bfe99dbad2 100644 --- a/crypto/build_dependency.sh +++ b/crypto/build_dependency.sh @@ -14,7 +14,7 @@ fi rm -rf "${RELIC_DIR}" # relic version or tag -relic_version="05feb20da8507260c9b3736dc1fd2efe7876d812" +relic_version="7d885d1ba34be61bf22190943a73549a910c1714" # clone a specific version of Relic without history if it's tagged. # git -c http.sslVerify=true clone --branch $(relic_version) --single-branch --depth 1 https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 7ee0a9773d5..be62d3f5a73 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -309,12 +309,12 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointG2 (C.int)(qualified)) // sum up Y var jointPublicKey pointG2 - C.ep2_sum_vector((*C.G2)(&jointPublicKey), + C.E2_sum_vector((*C.G2)(&jointPublicKey), (*C.G2)(&qualifiedPubKey[0]), (C.int)(qualified)) // sum up []y jointy := make([]pointG2, s.size) for i := 0; i < s.size; i++ { - C.ep2_sum_vector((*C.G2)(&jointy[i]), + C.E2_sum_vector((*C.G2)(&jointy[i]), (*C.G2)(&qualifiedy[i][0]), (C.int)(qualified)) } return &jointx, &jointPublicKey, jointy diff --git a/crypto/relic_build.sh b/crypto/relic_build.sh index 3045e22f59e..62f21ec5db5 100755 --- a/crypto/relic_build.sh +++ b/crypto/relic_build.sh @@ -63,9 +63,9 @@ PRIME=(-DFP_PRIME=381) # BN_METH=(-DBN_KARAT=0 -DBN_METHD="COMBA;COMBA;MONTY;SLIDE;BINAR;BASIC") FP_METH=(-DFP_KARAT=0 -DFP_METHD="INTEG;INTEG;INTEG;MONTY;MONTY;JMPDS;SLIDE") -PRIMES=(-DFP_PMERS=OFF -DFP_QNRES=ON -DFP_WIDTH=2) +PRIMES=(-DFP_PMERS=OFF -DFP_QNRES=ON) FPX_METH=(-DFPX_METHD="INTEG;INTEG;LAZYR") -EP_METH=(-DEP_MIXED=ON -DEP_PLAIN=OFF -DEP_ENDOM=ON -DEP_SUPER=OFF -DEP_DEPTH=4 -DEP_WIDTH=2 \ +EP_METH=(-DEP_MIXED=ON -DEP_PLAIN=OFF -DEP_ENDOM=ON -DEP_SUPER=OFF \ -DEP_CTMAP=ON -DEP_METHD="JACOB;LWNAF;COMBS;INTER") PP_METH=(-DPP_METHD="LAZYR;OATEP") From 444f75520509fd14945b25ec3fd855f8969ba0fd Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 14 Apr 2023 21:46:03 -0600 Subject: [PATCH 029/200] update BLSBatchVerify with regards to invalid signature format --- crypto/bls12381_utils.c | 12 +++++------ crypto/bls_core.c | 26 +++++++++++------------- crypto/bls_multisig.go | 45 ++++++++++++++++++++++++++--------------- crypto/bls_test.go | 30 ++++++++++++++++++++++++++- 4 files changed, 76 insertions(+), 37 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 423a0a890b0..83569661ab1 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -174,7 +174,7 @@ void Fr_copy(Fr* res, const Fr* a) { // sets `a` to 0 void Fr_set_zero(Fr* a){ - vec_zero((byte*)a, Fr_BYTES); + vec_zero((byte*)a, sizeof(Fr)); } void Fr_add(Fr *res, const Fr *a, const Fr *b) { 
@@ -313,7 +313,7 @@ BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! return BLST_BAD_SCALAR; } - vec_zero(tmp, Fr_BYTES); + vec_zero(tmp, sizeof(tmp)); limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); return BLST_SUCCESS; } @@ -383,12 +383,12 @@ const limb_t BLS12_381_pR[Fp_LIMBS] = { ONE_MONT_P }; /* (1<<384)%p */ // sets `a` to 0 void Fp_set_zero(Fp* a){ - vec_zero((byte*)a, Fp_BYTES); + vec_zero((byte*)a, sizeof(Fp)); } // sets `a` to limb `l` void Fp_set_limb(Fp* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), Fp_BYTES - sizeof(limb_t)); + vec_zero((byte*)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); *((limb_t*)a) = l; } @@ -949,12 +949,12 @@ void E2_write_bytes(byte *bin, const G2* a) { // set p to infinity void E2_set_infty(G2* p) { - vec_zero(p, G2_BYTES); + vec_zero(p, sizeof(G2)); } // check if `p` is infinity bool_t E2_is_infty(const G2* p) { - return vec_is_zero(p, sizeof(*p)); + return vec_is_zero(p, sizeof(G2)); } // checks affine point `p` is in E2 diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 5764ff64da8..e89bf755e4e 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -494,29 +494,27 @@ void bls_batchVerify(const int sigs_len, byte* results, const G2* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. - - // choose a random non-zero coefficient of at least 128 bits - // TODO: find a way to generate randoms - bn_rand(r, RLC_POS, SEC_BITS); - bn_add_dig(r, r, 1); - Fr* tmp = Fr_relic_to_blst(r); - // multiply public key by the random exponent - E2_mult(&pks[i], &pks_input[i], tmp); - int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); if (read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { if (read_ret == UNDEFINED) {// unexpected error case goto out; }; - // set signature as infinity and set result as invalid - // this result won't be overwritten + // set signature and key to infinity (no effect on the aggregation tree) + // and set result to invalid (result won't be overwritten) + E2_set_infty(&pks[i]); ep_set_infty(&sigs[i]); results[i] = INVALID; } else { - // multiply the signature by the same random exponent + // choose a random non-zero coefficient of at least 128 bits + // TODO: find a way to generate randoms + bn_rand(r, RLC_POS, SEC_BITS); + bn_add_dig(r, r, 1); + Fr* tmp = Fr_relic_to_blst(r); + // multiply public key and signature by the same random exponent + E2_mult(&pks[i], &pks_input[i], tmp); + free(tmp); ep_mul_lwnaf(&sigs[i], &sigs[i], r); - } - free(tmp); + } } // build a binary tree of aggreagtions node* root = build_tree(sigs_len, &pks[0], &sigs[0]); diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index e9139183c3f..d074825e0e2 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -472,7 +472,6 @@ func VerifyBLSSignatureManyMessages( func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { - // empty list check if len(pks) == 0 { return []bool{}, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) @@ -485,38 +484,48 @@ func BatchVerifyBLSSignaturesOneMessage( len(sigs)) } - verifBool := make([]bool, len(sigs)) + // return boolean array + returnBool := make([]bool, 
len(sigs)) + // temporary boolean array to hold the return values till all the return values are set + tmpBool := make([]bool, len(sigs)) + for i := range tmpBool { + tmpBool[i] = true // default to true + } if err := checkBLSHasher(kmac); err != nil { - return verifBool, err + return returnBool, err } - // an invalid signature with an incorrect header but correct length - invalidSig := make([]byte, signatureLengthBLSBLS12381) - invalidSig[0] = invalidBLSSignatureHeader // incorrect header - // flatten the shares (required by the C layer) flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) pkPoints := make([]pointG2, 0, len(pks)) + getIdentityPoint := func() pointG2 { + pk, _ := IdentityBLSPublicKey().(*pubKeyBLSBLS12381) // second value is guaranteed to be true + return pk.point + } + for i, pk := range pks { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { - return verifBool, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) + return returnBool, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) } - pkPoints = append(pkPoints, pkBLS.point) if len(sigs[i]) != signatureLengthBLSBLS12381 || pkBLS.isIdentity { - // force the signature to be invalid by replacing it with an invalid array - // that fails the deserialization in C.ep_read_bin_compact - flatSigs = append(flatSigs, invalidSig...) + // case of invalid signature: set the signature and public key at index `i` + // to identities so that there is no effect on the aggregation tree computation. + // However, the boolean return for index `i` is set to `false` and won't be overwritten. + tmpBool[i] = false + pkPoints = append(pkPoints, getIdentityPoint()) + flatSigs = append(flatSigs, identityBLSSignature...) } else { + pkPoints = append(pkPoints, pkBLS.point) flatSigs = append(flatSigs, sigs[i]...) } } // hash the input to 128 bytes h := kmac.ComputeHash(message) - verifInt := make([]byte, len(verifBool)) + verifInt := make([]byte, len(returnBool)) C.bls_batchVerify( (C.int)(len(verifInt)), @@ -529,12 +538,16 @@ func BatchVerifyBLSSignaturesOneMessage( for i, v := range verifInt { if (C.int)(v) != valid && (C.int)(v) != invalid { - return verifBool, fmt.Errorf("batch verification failed") + return returnBool, fmt.Errorf("batch verification failed") + } + if tmpBool[i] { // only overwrite if not previously written + tmpBool[i] = ((C.int)(v) == valid) } - verifBool[i] = ((C.int)(v) == valid) } - return verifBool, nil + // make sure returnBool is []false till this point + copy(returnBool, tmpBool) + return returnBool, nil } // blsAggregateEmptyListError is returned when a list of BLS objects (e.g. signatures or keys) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index ad29b088481..703ec9784b8 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -646,6 +646,27 @@ func TestBLSBatchVerify(t *testing.T) { sigs, sks, input, valid) }) + // valid signatures but indices aren't correct: sig[i] is correct under pks[j] + // and sig[j] is correct under pks[j]. + // implementations simply aggregating all signatures and keys would fail this test. 
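A related note (not part of the patch) on the identity substitution above: replacing a malformed pair with the G1 identity signature and the G2 identity key is what keeps the recursive aggregation tree usable, because the identity pair is vacuously consistent, e(0_G1, g2) = e(h, 0_G2) = 1_GT, so it cancels out of every aggregate node; only the leaf result at that index, already forced to false in tmpBool, records the failure.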
+ t.Run("valid signatures with incorrect indices", func(t *testing.T) { + i := mrand.Intn(sigsNum-1) + 1 + j := mrand.Intn(i) + // swap correct keys + pks[i], pks[j] = pks[j], pks[i] + + valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) + require.NoError(t, err) + expectedValid[i], expectedValid[j] = false, false + assert.Equal(t, valid, expectedValid, + "Verification of %s failed, private keys are %s, input is %x, results is %v", + sigs, sks, input, valid) + + // restore keys + pks[i], pks[j] = pks[j], pks[i] + expectedValid[i], expectedValid[j] = true, true + }) + // one valid signature t.Run("one valid signature", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) @@ -745,6 +766,13 @@ func TestBLSBatchVerify(t *testing.T) { }) } +// Utility function that flips a point sign bit to negate the point +// this is shortcut which works only for zcash BLS12-381 compressed serialization +// Applicable to both signatures and public keys +func negatePoint(pointbytes []byte) { + pointbytes[0] ^= 0x20 +} + // alter or fix a signature func alterSignature(s Signature) { // this causes the signature to remain in G1 and be invalid @@ -1080,7 +1108,7 @@ func TestBLSIdentity(t *testing.T) { require.NoError(t, err) oppositeSig := make([]byte, signatureLengthBLSBLS12381) copy(oppositeSig, sig) - oppositeSig[0] ^= 0x20 // flip the last 3rd bit to flip the point sign + negatePoint(oppositeSig) aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig}) require.NoError(t, err) assert.True(t, IsBLSSignatureIdentity(aggSig)) From d70883bbbf9a4ddf3b2dab1612add7e0b3741b44 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 14 Apr 2023 22:37:34 -0600 Subject: [PATCH 030/200] fix a bug and minor updates --- crypto/bls12381_utils.c | 2 +- crypto/sign_test_utils.go | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 83569661ab1..38c665329a1 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -393,7 +393,7 @@ void Fp_set_limb(Fp* a, const limb_t l){ } void Fp_copy(Fp* res, const Fp* a) { - vec_copy((byte*)res, (byte*)a, sizeof(Fr)); + vec_copy((byte*)res, (byte*)a, sizeof(Fp)); } static void Fp_add(Fp *res, const Fp *a, const Fp *b) { diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8a81e5bb45a..93895429dbe 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -47,7 +47,7 @@ func TestHasherErrors(t *testing.T) { // tests sign and verify are consistent for multiple generated keys and messages func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { - t.Run(fmt.Sprintf("Testing Generation/Signature/Verification for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("Generation/Signature/Verification for %s", salg), func(t *testing.T) { seed := make([]byte, KeyGenSeedMinLen) input := make([]byte, 100) r := time.Now().UnixNano() @@ -100,7 +100,6 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { require.NoError(t, err) assert.False(t, result, fmt.Sprintf( "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - } }) } @@ -154,7 +153,7 @@ func testKeyGenSeed(t *testing.T, salg SigningAlgorithm, minLen int, maxLen int) } func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("Testing encode/decode for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("encode/decode for 
%s", salg), func(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -259,7 +258,7 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { } func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorithm) { - t.Run(fmt.Sprintf("Testing Equals for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("equals for %s", salg), func(t *testing.T) { r := time.Now().UnixNano() mrand.Seed(r) t.Logf("math rand seed is %d", r) @@ -304,7 +303,7 @@ func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorit } func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("Testing key.Algorithm for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("key.Algorithm for %s", salg), func(t *testing.T) { alg := sk.Algorithm() assert.Equal(t, alg, salg) alg = sk.PublicKey().Algorithm() @@ -313,7 +312,7 @@ func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { } func testKeySize(t *testing.T, sk PrivateKey, skLen int, pkLen int) { - t.Run(fmt.Sprintf("Testing key.Size for %s", sk.Algorithm()), func(t *testing.T) { + t.Run(fmt.Sprintf("key.Size for %s", sk.Algorithm()), func(t *testing.T) { size := sk.Size() assert.Equal(t, size, skLen) size = sk.PublicKey().Size() From 54ee84ad291752c2577828b5950b686d66264633 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 15 Apr 2023 19:31:48 -0600 Subject: [PATCH 031/200] BLS threshold signature works with new G2 type --- crypto/bls12381_utils.c | 14 +++++- crypto/bls12381_utils.h | 1 + crypto/bls_thresholdsign.go | 4 +- crypto/bls_thresholdsign_include.h | 2 +- crypto/bls_thresholdsign_test.go | 7 ++- crypto/dkg.go | 3 -- crypto/dkg_core.c | 73 +++++++++++------------------- crypto/dkg_feldmanvss.go | 19 ++++---- crypto/dkg_feldmanvssq.go | 7 ++- crypto/dkg_include.h | 12 ++--- crypto/dkg_jointfeldman.go | 3 -- crypto/dkg_test.go | 11 ++--- crypto/thresholdsign.go | 2 - 13 files changed, 67 insertions(+), 91 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 38c665329a1..61b54bb2686 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -293,6 +293,7 @@ static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) } } +// internal type of BLST `pow256` uses bytes little endian. static void pow256_from_Fr(pow256 ret, const Fr* in) { le_bytes_from_limbs(ret, (limb_t*)in, Fr_BYTES); } @@ -998,10 +999,19 @@ static void E2_neg(G2* a) { } // Exponentiation of a generic point `a` in E2, res = expo.a -void E2_mult(G2* res, const G2* a, const Fr* expo) { +void E2_mult(G2* res, const G2* p, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign((POINTonE2*)res, (POINTonE2*)a, tmp); + POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, tmp); +} + +// Exponentiation of a generic point `a` in E2 by a byte exponent. +void E2_mult_small_expo(G2* res, const G2* p, const byte expo) { + pow256 pow_expo; // `pow256` uses bytes little endian. 
+ pow_expo[0] = expo; + vec_zero(&pow_expo[1], 32-1); + // TODO: to bench against a specific version of mult with 8 bits expo + POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, pow_expo); } // Exponentiation of generator g2 of G2, res = expo.g2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a5f636c2655..a67932fd43b 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -147,6 +147,7 @@ BLST_ERROR E2_read_bytes(G2*, const byte *, const int); void E2_write_bytes(byte *, const G2*); void G2_mult_gen(G2*, const Fr*); void E2_mult(G2*, const G2*, const Fr*); +void E2_mult_small_expo(G2*, const G2*, const byte); void E2_add(G2* res, const G2* a, const G2* b); void E2_sum_vector(G2*, const G2*, const int); diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 094f4ebc692..3fec93d96f5 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -3,7 +3,6 @@ package crypto -/* // #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" @@ -580,7 +579,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, } // compute the shares for i := index(1); int(i) <= size; i++ { - C.Fr_polynomialImage( + C.Fr_polynomial_image( (*C.Fr)(&x[i-1]), (*C.G2)(&y[i-1]), (*C.Fr)(&a[0]), (C.int)(len(a)), @@ -604,4 +603,3 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // generating an identity key is therefore negligible. return skShares, pkShares, pkGroup, nil } -*/ diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index b3e68f46328..861ba552241 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -6,6 +6,6 @@ #include "bls_include.h" int G1_lagrangeInterpolateAtZero_serialized(byte*, const byte* , const uint8_t[], const int); -extern void Fr_polynomialImage(Fr* out, ep2_t y, const Fr* a, const int a_size, const byte x); +extern void Fr_polynomial_image(Fr* out, G2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 04fe28d4db4..6d873da6e68 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -3,7 +3,6 @@ package crypto -/* import ( "crypto/rand" "fmt" @@ -22,8 +21,8 @@ func TestBLSThresholdSignature(t *testing.T) { t.Run("centralized_stateless_keygen", testCentralizedStatelessAPI) // stateful API t.Run("centralized_stateful_keygen", testCentralizedStatefulAPI) - t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) - t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case + //t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) + //t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case } const thresholdSignatureTag = "random tag" @@ -648,4 +647,4 @@ func BenchmarkSignatureReconstruction(b *testing.B) { require.NoError(b, err) } b.StopTimer() -}*/ +} diff --git a/crypto/dkg.go b/crypto/dkg.go index 1254db615f3..1cdf87a128e 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -1,7 +1,5 @@ package crypto -/* - import ( "errors" "fmt" @@ -237,4 +235,3 @@ type DKGProcessor interface { // log describes the misbehavior. 
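As a toy illustration (not part of the patch) of what Fr_polynomial_image computes for the threshold key generation above, using small integers instead of F_r elements and ignoring the reduction mod r: with coefficients a = (a_0, a_1) = (5, 3), the polynomial is P(X) = 5 + 3X, so participants 1, 2 and 3 receive the shares P(1) = 8, P(2) = 11 and P(3) = 14; any two of these points interpolate back to P(0) = 5, which plays the role of the group secret (hence the Lagrange interpolation at zero used during signature reconstruction), while a single share on its own reveals nothing about it.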
FlagMisbehavior(participant int, log string) } -*/ diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 9bf9dd8b2fc..0dd4844c08b 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -1,7 +1,7 @@ // +build relic #include "dkg_include.h" -/* + #define N_max 250 #define N_bits_max 8 // log(250) @@ -11,9 +11,9 @@ // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Fr_polynomialImage_export(byte* out, ep2_t y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image_export(byte* out, G2* y, const Fr* a, const int a_size, const byte x){ Fr image; - Fr_polynomialImage(&image, y, a, a_size, x); + Fr_polynomial_image(&image, y, a, a_size, x); // exports the result Fr_write_bytes(out, &image); } @@ -21,7 +21,7 @@ void Fr_polynomialImage_export(byte* out, ep2_t y, const Fr* a, const int a_size // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL -void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image(Fr* image, G2* y, const Fr* a, const int a_size, const byte x){ Fr_set_zero(image); // convert `x` to Montgomery form Fr xR; @@ -34,78 +34,59 @@ void Fr_polynomialImage(Fr* image, ep2_t y, const Fr* a, const int a_size, const } // compute y = P(x).g2 if (y) { - bn_st* tmp = Fr_blst_to_relic(image); - g2_mul_gen(y, tmp); - free(tmp); + G2_mult_gen(y, image); } } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y -// r is the order of G2 -static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, const byte x){ - - bn_t bn_x; - bn_new(bn_x); - ep2_set_infty(y); - bn_set_dig(bn_x, x); +static void G2_polynomial_image(G2* y, const G2* A, const int len_A, const byte x){ + E2_set_infty(y); for (int i = len_A-1; i >= 0 ; i--) { - ep2_mul_lwnaf(y, y, bn_x); - ep2_add_projc(y, y, (ep2_st*)&A[i]); + E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo + E2_add(y, y, &A[i]); } - - ep2_norm(y, y); // not necessary but called to optimize the - // multiple pairing computations with the same public key - bn_free(bn_x); } + // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2[X] -void G2_polynomialImages(ep2_st *y, const int len_y, const ep2_st* A, const int len_A) { +void G2_polynomial_images(G2 *y, const int len_y, const G2* A, const int len_A) { for (byte i=0; i Date: Sat, 15 Apr 2023 19:36:27 -0600 Subject: [PATCH 032/200] DKG works with new G2 type --- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 2dfe25a6cb0..0bf5ad6445c 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -510,7 +510,7 @@ func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) boo // check y[complainer] == share.G2 return C.verify_share( (*C.Fr)(&c.answer), - (*C.G2)(&s.y[complainer])) != 0 + (*C.G2)(&s.y[complainer])) == 0 } // data = |complainee| diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index ff96730b855..a35d259f4f2 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -18,9 +18,9 @@ import ( var gt *testing.T func TestDKG(t *testing.T) { - //t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) + t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) t.Run("FeldmanVSSQual", testFeldmanVSSQual) - //t.Run("JointFeldman", testJointFeldman) + t.Run("JointFeldman", testJointFeldman) } // optimal threshold (t) to allow the largest number of malicious participants (m) From 693029f964f95f3e27c871180bccb6398af09b6a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 15 Apr 2023 19:42:24 -0600 Subject: [PATCH 033/200] BLS-SPoCK works with new G2 type --- crypto/bls12381_utils.c | 9 ++++++--- crypto/bls12381_utils.h | 2 +- crypto/spock.go | 2 -- crypto/spock_test.go | 2 -- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 61b54bb2686..64ed4fae82c 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1037,7 +1037,7 @@ void E2_sum_vector(G2* jointy, const G2* y, const int len){ // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications // using the same public keys. -int bls_spock_verify(const ep2_t pk1, const byte* sig1, const ep2_t pk2, const byte* sig2) { +int bls_spock_verify(const G2* pk1, const byte* sig1, const G2* pk2, const byte* sig2) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -1063,11 +1063,14 @@ int bls_spock_verify(const ep2_t pk1, const byte* sig1, const ep2_t pk2, const b // elemsG2[1] = pk1 ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk1); + ep2_st* tmp = E2_blst_to_relic(pk1); + ep2_copy(elemsG2[1], tmp); // elemsG2[0] = pk2 ep2_new(elemsG2[0]); - ep2_copy(elemsG2[0], (ep2_st*)pk2); + tmp = E2_blst_to_relic(pk2); + ep2_copy(elemsG2[0], tmp); + free(tmp); #if DOUBLE_PAIRING // elemsG2[0] = -pk2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index a67932fd43b..471f2bc7bcc 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -90,7 +90,7 @@ int get_invalid(); int get_Fr_BYTES(); // BLS based SPoCK -int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); +int bls_spock_verify(const G2*, const byte*, const G2*, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) void map_to_G1(ep_t, const byte*, const int); diff --git a/crypto/spock.go b/crypto/spock.go index 18c39f8af15..ce80a7f2275 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -3,7 +3,6 @@ package crypto -/* // SPoCK design based on the BLS signature scheme. 
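// A SPoCK proof under secret key sk for data m is the BLS signature sk.H(m).
// Two proofs verify against each other when e(proof1, pk2) == e(proof2, pk1),
// which is the pairing equality computed in bls_spock_verify above; by bilinearity
// it holds when both proofs were generated over the same data, and otherwise fails
// except with negligible probability.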
// BLS is using BLS12-381 curve and the same settings in bls.go. @@ -105,4 +104,3 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur return false, fmt.Errorf("SPoCK verification failed") } } -*/ diff --git a/crypto/spock_test.go b/crypto/spock_test.go index 408e513bae0..45db590f04e 100644 --- a/crypto/spock_test.go +++ b/crypto/spock_test.go @@ -3,7 +3,6 @@ package crypto -/* import ( "crypto/rand" "testing" @@ -184,4 +183,3 @@ func TestSPOCKProveVerify(t *testing.T) { assert.False(t, result) }) } -*/ From 1a937638aa2541b61b38b2e4ab7d7ed93fd82c77 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 17 Apr 2023 14:47:03 -0600 Subject: [PATCH 034/200] clean up and G2 to E2 renaming --- crypto/bls.go | 12 +++---- crypto/bls12381_utils.c | 46 ++++++++++++------------ crypto/bls12381_utils.go | 58 +++++++++++++++--------------- crypto/bls12381_utils.h | 36 +++++++++---------- crypto/bls12381_utils_test.go | 34 +++++++++--------- crypto/bls_core.c | 26 +++++++------- crypto/bls_crossBLST_test.go | 2 +- crypto/bls_include.h | 8 ++--- crypto/bls_multisig.go | 36 +++++++++---------- crypto/bls_thresholdsign.go | 6 ++-- crypto/bls_thresholdsign_include.h | 2 +- crypto/blst_include.h | 22 ++++++------ crypto/dkg_core.c | 20 +++++------ crypto/dkg_feldmanvss.go | 34 +++++++++--------- crypto/dkg_feldmanvssq.go | 8 ++--- crypto/dkg_include.h | 12 +++---- crypto/dkg_jointfeldman.go | 26 +++++++------- crypto/spock.go | 4 +-- 18 files changed, 196 insertions(+), 196 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 65113f873ba..1375f7f0532 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -211,7 +211,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, nil } - verif := C.bls_verify((*C.G2)(&pk.point), + verif := C.bls_verify((*C.E2)(&pk.point), (*C.uchar)(&s[0]), (*C.uchar)(&h[0]), (C.int)(len(h))) @@ -352,7 +352,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.G2_check_membership((*C.G2)(&pk.point)) != valid { + if C.G2_check_membership((*C.E2)(&pk.point)) != valid { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } @@ -460,7 +460,7 @@ type pubKeyBLSBLS12381 struct { // sure the comparison is performed after an instance is created. // // public key G2 point - point pointG2 + point pointE2 // G2 identity check cache isIdentity bool } @@ -468,7 +468,7 @@ type pubKeyBLSBLS12381 struct { // newPubKeyBLSBLS12381 creates a new BLS public key with the given point. // If no scalar is provided, the function allocates an // empty scalar. -func newPubKeyBLSBLS12381(p *pointG2) *pubKeyBLSBLS12381 { +func newPubKeyBLSBLS12381(p *pointE2) *pubKeyBLSBLS12381 { if p != nil { key := &pubKeyBLSBLS12381{ point: *p, @@ -546,9 +546,9 @@ func (a *blsBLS12381Algo) init() error { // This is only a TEST/DEBUG/BENCH function. 
// It returns the hash to G1 point from a slice of 128 bytes -func mapToG1(data []byte) *pointG1 { +func mapToG1(data []byte) *pointE1 { l := len(data) - var h pointG1 + var h pointE1 C.map_to_G1((*C.ep_st)(&h), (*C.uchar)(&data[0]), (C.int)(l)) return &h } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 64ed4fae82c..d08880e4d99 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -815,7 +815,7 @@ static int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { } // TODO: temp utility function to delete -ep2_st* E2_blst_to_relic(const G2* x) { +ep2_st* E2_blst_to_relic(const E2* x) { ep2_st* out = (ep2_st*)malloc(sizeof(ep2_st)); byte* data = (byte*)malloc(G2_SER_BYTES); E2_write_bytes(data, x); @@ -837,7 +837,7 @@ ep2_st* E2_blst_to_relic(const G2* x) { // TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update logic with G2 subgroup check? -BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { +BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { return BLST_BAD_ENCODING; @@ -921,14 +921,14 @@ BLST_ERROR E2_read_bytes(G2* a, const byte *bin, const int len) { // The serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // The code is a modified version of Relic ep2_write_bin -void E2_write_bytes(byte *bin, const G2* a) { +void E2_write_bytes(byte *bin, const E2* a) { if (E2_is_infty(a)) { // set the infinity bit bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); memset(bin+1, 0, G2_SER_BYTES-1); return; } - G2 tmp; + E2 tmp; E2_to_affine(&tmp, a); Fp2* t_x = &(tmp.x); @@ -949,35 +949,35 @@ void E2_write_bytes(byte *bin, const G2* a) { } // set p to infinity -void E2_set_infty(G2* p) { - vec_zero(p, sizeof(G2)); +void E2_set_infty(E2* p) { + vec_zero(p, sizeof(E2)); } // check if `p` is infinity -bool_t E2_is_infty(const G2* p) { - return vec_is_zero(p, sizeof(G2)); +bool_t E2_is_infty(const E2* p) { + return vec_is_zero(p, sizeof(E2)); } // checks affine point `p` is in E2 -bool_t E2_affine_on_curve(const G2* p) { +bool_t E2_affine_on_curve(const E2* p) { // BLST's `POINTonE2_affine_on_curve` does not include the inifity case, // unlike what the function name means. return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); } // checks p1 == p2 -bool_t E2_is_equal(const G2* p1, const G2* p2) { +bool_t E2_is_equal(const E2* p1, const E2* p2) { // `POINTonE2_is_equal` includes the infinity case return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); } // res = p -void E2_copy(G2* res, const G2* p) { - vec_copy(res, p, sizeof(G2)); +void E2_copy(E2* res, const E2* p) { + vec_copy(res, p, sizeof(E2)); } // converts an E2 point from Jacobian into affine coordinates (z=1) -void E2_to_affine(G2* res, const G2* p) { +void E2_to_affine(E2* res, const E2* p) { // minor optimization in case coordinates are already affine if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { E2_copy(res, p); @@ -988,25 +988,25 @@ void E2_to_affine(G2* res, const G2* p) { } // generic point addition that must handle doubling and points at infinity -void E2_add(G2* res, const G2* a, const G2* b) { +void E2_add(E2* res, const E2* a, const E2* b) { POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } // Point negation in place. 
-// no need for an api of the form E2_neg(G2* res, const G2* a) for now -static void E2_neg(G2* a) { +// no need for an api of the form E2_neg(E2* res, const E2* a) for now +static void E2_neg(E2* a) { POINTonE2_cneg((POINTonE2*)a, 1); } // Exponentiation of a generic point `a` in E2, res = expo.a -void E2_mult(G2* res, const G2* p, const Fr* expo) { +void E2_mult(E2* res, const E2* p, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, tmp); } // Exponentiation of a generic point `a` in E2 by a byte exponent. -void E2_mult_small_expo(G2* res, const G2* p, const byte expo) { +void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { pow256 pow_expo; // `pow256` uses bytes little endian. pow_expo[0] = expo; vec_zero(&pow_expo[1], 32-1); @@ -1015,14 +1015,14 @@ void E2_mult_small_expo(G2* res, const G2* p, const byte expo) { } // Exponentiation of generator g2 of G2, res = expo.g2 -void G2_mult_gen(G2* res, const Fr* expo) { +void G2_mult_gen(E2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); } // computes the sum of the G2 array elements y and writes the sum in jointy -void E2_sum_vector(G2* jointy, const G2* y, const int len){ +void E2_sum_vector(E2* jointy, const E2* y, const int len){ E2_set_infty(jointy); for (int i=0; ix)); Fp2_print_(".y", &(a->y)); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 2297e434c2f..c8c08e8ac0e 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -20,8 +20,8 @@ import ( // Go wrappers around BLST C types // Go wrappers around Relic C types -type pointG1 C.ep_st -type pointG2 C.G2 +type pointE1 C.ep_st +type pointE2 C.E2 type scalar C.Fr // BLS12-381 related lengths @@ -29,8 +29,8 @@ var frBytesLen = int(C.get_Fr_BYTES()) // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -//type pointG1_blst C.G1 -//type pointG2_blst C.G2 +//type pointG1_blst C.E1 +//type pointG2_blst C.E2 // context required for the BLS set-up type ctx struct { @@ -79,25 +79,25 @@ func seedRelic(seed []byte) error { } // Exponentiation in G1 (scalar point multiplication) -func (p *pointG1) scalarMultG1(res *pointG1, expo *scalar) { +func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) } // This function is for TEST only // Exponentiation of g1 in G1 -func generatorScalarMultG1(res *pointG1, expo *scalar) { +func generatorScalarMultG1(res *pointE1, expo *scalar) { C.ep_mult_gen_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // This function is for TEST only // Generic Exponentiation G1 -func genericScalarMultG1(res *pointG1, expo *scalar) { +func genericScalarMultG1(res *pointE1, expo *scalar) { C.ep_mult_generic_bench((*C.ep_st)(res), (*C.Fr)(expo)) } // Exponentiation of g2 in G2 -func generatorScalarMultG2(res *pointG2, expo *scalar) { - C.G2_mult_gen((*C.G2)(res), (*C.Fr)(expo)) +func generatorScalarMultG2(res *pointE2, expo *scalar) { + C.G2_mult_gen((*C.E2)(res), (*C.Fr)(expo)) } // comparison in Fr where r is the group order of G1/G2 @@ -107,8 +107,8 @@ func (x *scalar) equals(other *scalar) bool { } // comparison in G2 -func (p *pointG2) equals(other *pointG2) bool { - return C.E2_is_equal((*C.G2)(p), (*C.G2)(other)) != 0 +func (p *pointE2) equals(other *pointE2) bool { + return C.E2_is_equal((*C.E2)(p), (*C.E2)(other)) != 0 } // Comparison to zero in Fr. 
@@ -118,8 +118,8 @@ func (x *scalar) isZero() bool { } // Comparison to point at infinity in G2. -func (p *pointG2) isInfinity() bool { - return C.E2_is_infty((*C.G2)(p)) != 0 +func (p *pointE2) isInfinity() bool { + return C.E2_is_infty((*C.E2)(p)) != 0 } // returns a random element of Fr in input pointer @@ -165,16 +165,16 @@ func writeScalar(dest []byte, x *scalar) { // writePointG2 writes a G2 point in a slice of bytes // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG2(dest []byte, a *pointG2) { +func writePointG2(dest []byte, a *pointE2) { C.E2_write_bytes((*C.uchar)(&dest[0]), - (*C.G2)(a), + (*C.E2)(a), ) } // writePointG1 writes a G1 point in a slice of bytes // The slice should be of size SignatureLenBLSBLS12381 and the serialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG1(dest []byte, a *pointG1) { +func writePointG1(dest []byte, a *pointE1) { C.ep_write_bin_compact((*C.uchar)(&dest[0]), (*C.ep_st)(a), (C.int)(signatureLengthBLSBLS12381), @@ -206,8 +206,8 @@ func readScalarFrStar(a *scalar, src []byte) error { // readPointG2 reads a G2 point from a slice of bytes // The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG2(a *pointG2, src []byte) error { - read := C.E2_read_bytes((*C.G2)(a), +func readPointG2(a *pointE2, src []byte) error { + read := C.E2_read_bytes((*C.E2)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) @@ -226,7 +226,7 @@ func readPointG2(a *pointG2, src []byte) error { // readPointG1 reads a G1 point from a slice of bytes // The slice should be of size SignatureLenBLSBLS12381 and the deserialization will // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG1(a *pointG1, src []byte) error { +func readPointG1(a *pointE1, src []byte) error { switch C.ep_read_bin_compact((*C.ep_st)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) { @@ -241,39 +241,39 @@ func readPointG1(a *pointG1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. -func checkMembershipG1(pt *pointG1) int { +func checkMembershipG1(pt *pointE1) int { return int(C.check_membership_G1((*C.ep_st)(pt))) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. -func checkMembershipG2(pt *pointG2) int { - return int(C.G2_check_membership((*C.G2)(pt))) +func checkMembershipG2(pt *pointE2) int { + return int(C.G2_check_membership((*C.E2)(pt))) } // randPointG1 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G1 and stores it in input point. -func randPointG1(pt *pointG1) { +func randPointG1(pt *pointE1) { C.ep_rand_G1((*C.ep_st)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E1\G1 and stores it in input point. -func randPointG1Complement(pt *pointG1) { +func randPointG1Complement(pt *pointE1) { C.ep_rand_G1complement((*C.ep_st)(pt)) } /* // randPointG2 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G2 and stores it in input point. 
-func randPointG2(pt *pointG2) { - C.ep2_rand_G2((*C.G2)(pt)) +func randPointG2(pt *pointE2) { + C.ep2_rand_G2((*C.E2)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func randPointG2Complement(pt *pointG2) { - C.ep2_rand_G2complement((*C.G2)(pt)) +func randPointG2Complement(pt *pointE2) { + C.ep2_rand_G2complement((*C.E2)(pt)) } */ @@ -295,7 +295,7 @@ func hashToG1Bytes(data, dst []byte) []byte { (*C.uchar)(&dst[0]), (C.int)(len(dst))) // map the hash to G1 - var point pointG1 + var point pointE1 C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) // serialize the point diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 471f2bc7bcc..b5477187dcd 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -83,14 +83,14 @@ typedef struct prec_ { // TODO: to delete when Relic is removed bn_st* Fr_blst_to_relic(const Fr* x); Fr* Fr_relic_to_blst(const bn_st* x); -ep2_st* E2_blst_to_relic(const G2* x); +ep2_st* E2_blst_to_relic(const E2* x); int get_valid(); int get_invalid(); int get_Fr_BYTES(); // BLS based SPoCK -int bls_spock_verify(const G2*, const byte*, const G2*, const byte*); +int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) void map_to_G1(ep_t, const byte*, const int); @@ -137,24 +137,24 @@ int bowe_subgroup_check_G1(const ep_t); #endif // E2 and G2 utilities -void E2_set_infty(G2* p); -bool_t E2_is_infty(const G2*); -bool_t E2_affine_on_curve(const G2*); -bool_t E2_is_equal(const G2* p1, const G2* p2); -void E2_copy(G2*, const G2*); -void E2_to_affine(G2*, const G2*); -BLST_ERROR E2_read_bytes(G2*, const byte *, const int); -void E2_write_bytes(byte *, const G2*); -void G2_mult_gen(G2*, const Fr*); -void E2_mult(G2*, const G2*, const Fr*); -void E2_mult_small_expo(G2*, const G2*, const byte); -void E2_add(G2* res, const G2* a, const G2* b); -void E2_sum_vector(G2*, const G2*, const int); +void E2_set_infty(E2* p); +bool_t E2_is_infty(const E2*); +bool_t E2_affine_on_curve(const E2*); +bool_t E2_is_equal(const E2* p1, const E2* p2); +void E2_copy(E2*, const E2*); +void E2_to_affine(E2*, const E2*); +BLST_ERROR E2_read_bytes(E2*, const byte *, const int); +void E2_write_bytes(byte *, const E2*); +void G2_mult_gen(E2*, const Fr*); +void E2_mult(E2*, const E2*, const Fr*); +void E2_mult_small_expo(E2*, const E2*, const byte); +void E2_add(E2* res, const E2* a, const E2* b); +void E2_sum_vector(E2*, const E2*, const int); void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); -void E2_subtract_vector(G2* res, const G2* x, const G2* y, const int len); -int G2_check_membership(const G2*); +void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); +int G2_check_membership(const E2*); int simple_subgroup_check_G2(const ep2_t); void ep2_rand_G2(ep2_t); void ep2_rand_G2complement( ep2_t); @@ -173,7 +173,7 @@ void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); void Fp_print_(char*, Fp*); void Fp2_print_(char*, const Fp2*); -void E2_print_(char*, const G2*); +void E2_print_(char*, const E2*); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 337849c78f3..cf0c37d7856 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -23,7 +23,7 @@ func BenchmarkScalarMultG1G2(b 
*testing.B) { // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { - var res pointG1 + var res pointE1 b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG1(&res, &expo) @@ -33,7 +33,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { // G1 base point multiplication b.Run("G1 generic", func(b *testing.B) { - var res pointG1 + var res pointE1 b.ResetTimer() for i := 0; i < b.N; i++ { genericScalarMultG1(&res, &expo) @@ -43,7 +43,7 @@ func BenchmarkScalarMultG1G2(b *testing.B) { // G2 base point multiplication b.Run("G2 gen", func(b *testing.B) { - var res pointG2 + var res pointE2 b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG2(&res, &expo) @@ -60,18 +60,18 @@ func TestMapToG1(t *testing.T) { msgs := [][]byte{ []byte{}, - //[]byte("abc"), - //[]byte("abcdef0123456789"), - //[]byte("q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq"), - //[]byte("a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), + []byte("abc"), + []byte("abcdef0123456789"), + []byte("q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq"), + []byte("a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), } expectedPointString := []string{ "052926add2207b76ca4fa57a8734416c8dc95e24501772c814278700eed6d1e4e8cf62d9c09db0fac349612b759e79a1", - //"03567bc5ef9c690c2ab2ecdf6a96ef1c139cc0b2f284dca0a9a7943388a49a3aee664ba5379a7655d3c68900be2f6903", - //"11e0b079dea29a68f0383ee94fed1b940995272407e3bb916bbf268c263ddd57a6a27200a784cbc248e84f357ce82d98", - //"15f68eaa693b95ccb85215dc65fa81038d69629f70aeee0d0f677cf22285e7bf58d7cb86eefe8f2e9bc3f8cb84fac488", - //"082aabae8b7dedb0e78aeb619ad3bfd9277a2f77ba7fad20ef6aabdc6c31d19ba5a6d12283553294c1825c4b3ca2dcfe", + "03567bc5ef9c690c2ab2ecdf6a96ef1c139cc0b2f284dca0a9a7943388a49a3aee664ba5379a7655d3c68900be2f6903", + "11e0b079dea29a68f0383ee94fed1b940995272407e3bb916bbf268c263ddd57a6a27200a784cbc248e84f357ce82d98", + "15f68eaa693b95ccb85215dc65fa81038d69629f70aeee0d0f677cf22285e7bf58d7cb86eefe8f2e9bc3f8cb84fac488", + "082aabae8b7dedb0e78aeb619ad3bfd9277a2f77ba7fad20ef6aabdc6c31d19ba5a6d12283553294c1825c4b3ca2dcfe", } for i, msg := range msgs { @@ -106,8 +106,8 @@ func TestSubgroupCheck(t *testing.T) { _, _ = rand.Read(seed) _ = seedRelic(seed) - t.Run("G1", func(t *testing.T) { - var p pointG1 + /*t.Run("G1", func(t *testing.T) { + var p pointE1 randPointG1(&p) // point in G1 res := checkMembershipG1(&p) assert.Equal(t, res, int(valid)) @@ -115,9 +115,9 @@ func 
TestSubgroupCheck(t *testing.T) { res = checkMembershipG1(&p) assert.Equal(t, res, int(invalid)) }) - /* + t.Run("G2", func(t *testing.T) { - var p pointG2 + var p pointE2 randPointG2(&p) // point in G2 res := checkMembershipG2(&p) assert.Equal(t, res, int(valid)) @@ -132,7 +132,7 @@ func TestSubgroupCheck(t *testing.T) { func BenchmarkSubgroupCheck(b *testing.B) { b.Run("G1", func(b *testing.B) { - var p pointG1 + var p pointE1 randPointG1(&p) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -142,7 +142,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { }) /* b.Run("G2", func(b *testing.B) { - var p pointG2 + var p pointE2 randPointG2(&p) b.ResetTimer() for i := 0; i < b.N; i++ { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e89bf755e4e..6315e711484 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -40,7 +40,7 @@ int check_membership_G1(const ep_t p){ // // membership check in G2 is using a scalar multiplication by the group order. // TODO: switch to the faster Bowe check -int G2_check_membership(const G2* p){ +int G2_check_membership(const E2* p){ #if MEMBERSHIP_CHECK // check p is on curve if (!E2_affine_on_curve(p)) // TODO: remove and assume inputs are on curve? @@ -84,7 +84,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -153,7 +153,7 @@ static int bls_verify_ep(const G2* pk, const ep_t s, const byte* data, const int // the membership check is separated to allow optimizing multiple verifications using the same pks int bls_verifyPerDistinctMessage(const byte* sig, const int nb_hashes, const byte* hashes, const uint32_t* len_hashes, - const uint32_t* pks_per_hash, const G2* pks) { + const uint32_t* pks_per_hash, const E2* pks) { int ret = UNDEFINED; // return value @@ -189,7 +189,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, // aggregate public keys mapping to the same hash offset = 0; - G2 tmp; + E2 tmp; for (int i=1; i < nb_hashes+1; i++) { // elemsG2[i] = agg_pk[i] E2_sum_vector(&tmp, &pks[offset] , pks_per_hash[i-1]); @@ -241,7 +241,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, // membership check of pks in G2 is not verified in this function // the membership check is separated to allow optimizing multiple verifications using the same pks int bls_verifyPerDistinctKey(const byte* sig, - const int nb_pks, const G2* pks, const uint32_t* hashes_per_pk, + const int nb_pks, const E2* pks, const uint32_t* hashes_per_pk, const byte* hashes, const uint32_t* len_hashes){ int ret = UNDEFINED; // return value @@ -335,7 +335,7 @@ int bls_verifyPerDistinctKey(const byte* sig, // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications using the same key. -int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { +int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { ep_t s; ep_new(s); @@ -360,15 +360,15 @@ int bls_verify(const G2* pk, const byte* sig, const byte* data, const int len) { // The leaves contain the initial signatures and public keys. 
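// Each internal node holds the sum of its children's signatures (in E1) and of
// their public keys (in E2). Verification starts at the root aggregate: if it
// passes, all leaves below are marked valid in one check; if it fails, the check
// recurses into both children, so invalid signatures are isolated with roughly a
// logarithmic number of extra verifications each.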
typedef struct st_node { ep_st* sig; - G2* pk; + E2* pk; struct st_node* left; struct st_node* right; } node; -static node* new_node(const G2* pk, const ep_st* sig){ +static node* new_node(const E2* pk, const ep_st* sig){ node* t = (node*) malloc(sizeof(node)); if (t) { - t->pk = (G2*)pk; + t->pk = (E2*)pk; t->sig = (ep_st*)sig; t->right = t->left = NULL; } @@ -395,7 +395,7 @@ static void free_tree(node* root) { } // builds a binary tree of aggregation of signatures and public keys recursively. -static node* build_tree(const int len, const G2* pks, const ep_st* sigs) { +static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { // check if a leaf is reached if (len == 1) { return new_node(&pks[0], &sigs[0]); // use the first element of the arrays @@ -406,7 +406,7 @@ static node* build_tree(const int len, const G2* pks, const ep_st* sigs) { int left_len = len - right_len; // create a new node with new points - G2* new_pk = (G2*)malloc(sizeof(G2)); + E2* new_pk = (E2*)malloc(sizeof(E2)); if (!new_pk) goto error; ep_st* new_sig = (ep_st*)malloc(sizeof(ep_st)); if (!new_sig) goto error_sig; @@ -471,14 +471,14 @@ static void bls_batchVerify_tree(const node* root, const int len, byte* results, // indices mixup. // - optimize the verification by verifying an aggregated signature against an aggregated // public key, and use a recursive verification to find invalid signatures. -void bls_batchVerify(const int sigs_len, byte* results, const G2* pks_input, +void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, const byte* sigs_bytes, const byte* data, const int data_len) { // initialize results to undefined memset(results, UNDEFINED, sigs_len); // build the arrays of G1 and G2 elements to verify - G2* pks = (G2*) malloc(sigs_len * sizeof(G2)); + E2* pks = (E2*) malloc(sigs_len * sizeof(E2)); if (!pks) return; ep_st* sigs = (ep_st*) malloc(sigs_len * sizeof(ep_st)); if (!sigs) goto out_sigs; diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 143454dfb25..949e1f6d3b7 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -151,7 +151,7 @@ func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) // check decoding results are consistent - var pointFlow pointG1 + var pointFlow pointE1 // here we test readPointG1 rather than the simple Signature type alias err := readPointG1(&pointFlow, sigBytes) flowPass := (err == nil) && (checkMembershipG1(&pointFlow) == int(valid)) diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 32b9f506c8c..f81f2839bcf 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -36,13 +36,13 @@ int get_pk_len(); int get_sk_len(); void bls_sign(byte*, const Fr*, const byte*, const int); -int bls_verify(const G2*, const byte*, const byte*, const int); +int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, - const uint32_t*, const G2*); + const uint32_t*, const E2*); int bls_verifyPerDistinctKey(const byte*, - const int, const G2*, const uint32_t*, + const int, const E2*, const uint32_t*, const byte*, const uint32_t*); -void bls_batchVerify(const int, byte*, const G2*, +void bls_batchVerify(const int, byte*, const E2*, const byte*, const byte*, const int); #endif diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index d074825e0e2..e6589a60031 100644 --- a/crypto/bls_multisig.go +++ 
b/crypto/bls_multisig.go @@ -183,7 +183,7 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { return nil, blsAggregateEmptyListError } - points := make([]pointG2, 0, len(keys)) + points := make([]pointE2, 0, len(keys)) for i, pk := range keys { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { @@ -192,8 +192,8 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { points = append(points, pkBLS.point) } - var sum pointG2 - C.E2_sum_vector((*C.G2)(&sum), (*C.G2)(&points[0]), + var sum pointE2 + C.E2_sum_vector((*C.E2)(&sum), (*C.E2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) @@ -207,7 +207,7 @@ func IdentityBLSPublicKey() PublicKey { identity := *newPubKeyBLSBLS12381(nil) // set the point to infinity - C.E2_set_infty((*C.G2)(&identity.point)) + C.E2_set_infty((*C.E2)(&identity.point)) identity.isIdentity = true return &identity } @@ -233,7 +233,7 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, return nil, notBLSKeyError } - pointsToSubtract := make([]pointG2, 0, len(keysToRemove)) + pointsToSubtract := make([]pointE2, 0, len(keysToRemove)) for i, pk := range keysToRemove { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { @@ -247,9 +247,9 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, return aggKey, nil } - var resultPoint pointG2 - C.E2_subtract_vector((*C.G2)(&resultPoint), (*C.G2)(&aggPKBLS.point), - (*C.G2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) + var resultPoint pointE2 + C.E2_subtract_vector((*C.E2)(&resultPoint), (*C.E2)(&aggPKBLS.point), + (*C.E2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) resultKey := newPubKeyBLSBLS12381(&resultPoint) return resultKey, nil @@ -356,13 +356,13 @@ func VerifyBLSSignatureManyMessages( // The comparison of the maps length minimizes the number of pairings to // compute by aggregating either public keys or the message hashes in // the verification equation. - mapPerHash := make(map[string][]pointG2) - mapPerPk := make(map[pointG2][][]byte) + mapPerHash := make(map[string][]pointE2) + mapPerPk := make(map[pointE2][][]byte) // Note: mapPerPk is using a cgo structure as map keys which may lead to 2 equal public keys // being considered distinct. This does not make the verification equation wrong but leads to // computing extra pairings. This case is considered unlikely to happen since a caller is likely // to use the same struct for a same public key. - // One way to fix this is to use the public key encoding as the map keys and store the "pointG2" + // One way to fix this is to use the public key encoding as the map keys and store the "pointE2" // structure with the map value, which adds more complexity and processing time. // fill the 2 maps @@ -390,7 +390,7 @@ func VerifyBLSSignatureManyMessages( flatDistinctHashes := make([]byte, 0) lenHashes := make([]uint32, 0) pkPerHash := make([]uint32, 0, len(mapPerHash)) - allPks := make([]pointG2, 0) + allPks := make([]pointE2, 0) for hash, pksVal := range mapPerHash { flatDistinctHashes = append(flatDistinctHashes, []byte(hash)...) lenHashes = append(lenHashes, uint32(len([]byte(hash)))) @@ -403,13 +403,13 @@ func VerifyBLSSignatureManyMessages( (*C.uchar)(&flatDistinctHashes[0]), (*C.uint32_t)(&lenHashes[0]), (*C.uint32_t)(&pkPerHash[0]), - (*C.G2)(&allPks[0]), + (*C.E2)(&allPks[0]), ) } else { // aggregate hashes per distinct key // using the linearity of the pairing on the G1 variables. 
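		// i.e. the equation checked becomes e(s, g2) == prod_k e(sum_j H(m_{k,j}), pk_k),
		// costing one pairing per distinct key, whereas the branch above aggregates
		// public keys in G2 for one pairing per distinct message; both groupings are
		// valid by bilinearity of the pairing.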
- distinctPks := make([]pointG2, 0, len(mapPerPk)) + distinctPks := make([]pointE2, 0, len(mapPerPk)) hashPerPk := make([]uint32, 0, len(mapPerPk)) flatHashes := make([]byte, 0) lenHashes := make([]uint32, 0) @@ -425,7 +425,7 @@ func VerifyBLSSignatureManyMessages( verif = C.bls_verifyPerDistinctKey( (*C.uchar)(&s[0]), (C.int)(len(mapPerPk)), - (*C.G2)(&distinctPks[0]), + (*C.E2)(&distinctPks[0]), (*C.uint32_t)(&hashPerPk[0]), (*C.uchar)(&flatHashes[0]), (*C.uint32_t)(&lenHashes[0])) @@ -497,9 +497,9 @@ func BatchVerifyBLSSignaturesOneMessage( // flatten the shares (required by the C layer) flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) - pkPoints := make([]pointG2, 0, len(pks)) + pkPoints := make([]pointE2, 0, len(pks)) - getIdentityPoint := func() pointG2 { + getIdentityPoint := func() pointE2 { pk, _ := IdentityBLSPublicKey().(*pubKeyBLSBLS12381) // second value is guaranteed to be true return pk.point } @@ -530,7 +530,7 @@ func BatchVerifyBLSSignaturesOneMessage( C.bls_batchVerify( (C.int)(len(verifInt)), (*C.uchar)(&verifInt[0]), - (*C.G2)(&pkPoints[0]), + (*C.E2)(&pkPoints[0]), (*C.uchar)(&flatSigs[0]), (*C.uchar)(&h[0]), (C.int)(len(h)), diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 3fec93d96f5..72fa421def3 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -555,8 +555,8 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, // the scalars x and G2 points y x := make([]scalar, size) - y := make([]pointG2, size) - var X0 pointG2 + y := make([]pointE2, size) + var X0 pointE2 // seed relic if err := seedRelic(seed); err != nil { @@ -581,7 +581,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, for i := index(1); int(i) <= size; i++ { C.Fr_polynomial_image( (*C.Fr)(&x[i-1]), - (*C.G2)(&y[i-1]), + (*C.E2)(&y[i-1]), (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(i), ) diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 861ba552241..1bc5809d405 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -6,6 +6,6 @@ #include "bls_include.h" int G1_lagrangeInterpolateAtZero_serialized(byte*, const byte* , const uint8_t[], const int); -extern void Fr_polynomial_image(Fr* out, G2* y, const Fr* a, const int a_size, const byte x); +extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/blst_include.h b/crypto/blst_include.h index c480a68d27e..65f552d6fae 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -77,12 +77,13 @@ typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS // `Fp` does not need to be exported to cgo. typedef vec384 Fp; -// Subroup G1 in E1 -// G1 points are represented in Jacobian coordinates (x,y,z), +// curve E_1 (over F_p) +// E_1 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). -// `G1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) -// `G1` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp x,y,z;} G1; +// `E1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian E1 elements) +// `E1` is defined as a struct to be exportable through cgo to the Go layer. +// `E1` is also used to represent all subgroup G_1 elements. 
+typedef struct {Fp x,y,z;} E1; // field elements F_p^2 // F_p^2 elements are represented as a vector of two F_p elements. @@ -94,11 +95,12 @@ typedef vec384x Fp2; #define imag(p) ((*(p))[1]) -// Subroup G2 in E2 -// G2 points are represented in Jacobian coordinates (x,y,z), +// curve E_2 (over F_p^2) +// E_2 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). -// `G2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E1 elements) -// `G2` is defined as a struct to be exportable through cgo to the Go layer. -typedef struct {Fp2 x,y,z;} G2; +// `E2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian E2 elements) +// `E2` is defined as a struct to be exportable through cgo to the Go layer. +// `E2` is also used to represent all subgroup G_2 elements. +typedef struct {Fp2 x,y,z;} E2; #endif diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 0dd4844c08b..aedf5d83164 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -11,7 +11,7 @@ // r being the order of G1 // writes P(x) in out and P(x).g2 in y if y is non NULL // x being a small integer -void Fr_polynomial_image_export(byte* out, G2* y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image_export(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ Fr image; Fr_polynomial_image(&image, y, a, a_size, x); // exports the result @@ -21,7 +21,7 @@ void Fr_polynomial_image_export(byte* out, G2* y, const Fr* a, const int a_size, // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL -void Fr_polynomial_image(Fr* image, G2* y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int a_size, const byte x){ Fr_set_zero(image); // convert `x` to Montgomery form Fr xR; @@ -40,7 +40,7 @@ void Fr_polynomial_image(Fr* image, G2* y, const Fr* a, const int a_size, const // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y -static void G2_polynomial_image(G2* y, const G2* A, const int len_A, const byte x){ +static void E2_polynomial_image(E2* y, const E2* A, const int len_A, const byte x){ E2_set_infty(y); for (int i = len_A-1; i >= 0 ; i--) { E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo @@ -51,17 +51,17 @@ static void G2_polynomial_image(G2* y, const G2* A, const int len_A, const byte // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2[X] -void G2_polynomial_images(G2 *y, const int len_y, const G2* A, const int len_A) { +void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int len_A) { for (byte i=0; i Date: Mon, 17 Apr 2023 15:22:54 -0600 Subject: [PATCH 035/200] rename some G1/G2 functions to E1/E2 --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 8 ++++---- crypto/bls12381_utils.go | 20 +++++++++++--------- crypto/bls12381_utils.h | 6 +++--- crypto/bls_core.c | 14 +++++++------- crypto/bls_crossBLST_test.go | 4 ++-- 6 files changed, 28 insertions(+), 26 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 1375f7f0532..d45ea7f3aeb 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -346,7 +346,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err pubKeyLengthBLSBLS12381, len(publicKeyBytes)) } var pk pubKeyBLSBLS12381 - err := readPointG2(&pk.point, publicKeyBytes) + err := readPointE2(&pk.point, publicKeyBytes) if err != nil { return nil, fmt.Errorf("decode public key failed: %w", err) } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d08880e4d99..9b91e8e0ebd 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1048,7 +1048,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s1 is in G1 - if (check_membership_G1(elemsG1[0]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (G1_check_membership(elemsG1[0]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 return INVALID; // elemsG1[1] = s2 @@ -1058,7 +1058,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s2 in G1 - if (check_membership_G1(elemsG1[1]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (G1_check_membership(elemsG1[1]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 return INVALID; // elemsG2[1] = pk1 @@ -1160,7 +1160,7 @@ int ep_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int len) { // uses a simple scalar multiplication by G1's order // to check whether a point on the curve E1 is in G1. -int simple_subgroup_check_G1(const ep_t p){ +int G1_simple_subgroup_check(const ep_t p){ ep_t inf; ep_new(inf); // check p^order == infinity @@ -1176,7 +1176,7 @@ int simple_subgroup_check_G1(const ep_t p){ // uses a simple scalar multiplication by G1's order // to check whether a point on the curve E2 is in G2. -int simple_subgroup_check_G2(const ep2_t p){ +int G2_simple_subgroup_check(const ep2_t p){ ep2_t inf; ep2_new(inf); // check p^order == infinity diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index c8c08e8ac0e..59776fcec5b 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -203,10 +203,11 @@ func readScalarFrStar(a *scalar, src []byte) error { } -// readPointG2 reads a G2 point from a slice of bytes -// The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG2(a *pointE2, src []byte) error { +// readPointE2 reads a E2 point from a slice of bytes +// The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. +// No G2 membership check is performed. 
+func readPointE2(a *pointE2, src []byte) error { read := C.E2_read_bytes((*C.E2)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) @@ -223,10 +224,11 @@ func readPointG2(a *pointE2, src []byte) error { } } -// readPointG1 reads a G1 point from a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the deserialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG1(a *pointE1, src []byte) error { +// readPointE1 reads a E1 point from a slice of bytes +// The slice should be of size SignatureLenBLSBLS12381 and the deserialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. +// No G1 membership check is performed. +func readPointE1(a *pointE1, src []byte) error { switch C.ep_read_bin_compact((*C.ep_st)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) { @@ -242,7 +244,7 @@ func readPointG1(a *pointE1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. func checkMembershipG1(pt *pointE1) int { - return int(C.check_membership_G1((*C.ep_st)(pt))) + return int(C.G1_check_membership((*C.ep_st)(pt))) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b5477187dcd..01f68610603 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -128,8 +128,8 @@ void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -int check_membership_G1(const ep_t); -int simple_subgroup_check_G1(const ep_t); +int G1_check_membership(const ep_t); +int G1_simple_subgroup_check(const ep_t); void ep_rand_G1(ep_t); void ep_rand_G1complement( ep_t); #if (MEMBERSHIP_CHECK_G1 == BOWE) @@ -155,7 +155,7 @@ void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); int G2_check_membership(const E2*); -int simple_subgroup_check_G2(const ep2_t); +int G2_simple_subgroup_check(const ep2_t); void ep2_rand_G2(ep2_t); void ep2_rand_G2complement( ep2_t); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 6315e711484..eae1382e6a1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,10 +21,10 @@ int get_sk_len() { // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. 
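// Depending on MEMBERSHIP_CHECK_G1, the check is either the plain exponentiation
// by the group order r (G1_simple_subgroup_check, comparing r.p to infinity) or
// the faster method of https://eprint.iacr.org/2019/814.pdf (section 3.2).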
-int check_membership_G1(const ep_t p){ +int G1_check_membership(const ep_t p){ #if MEMBERSHIP_CHECK #if MEMBERSHIP_CHECK_G1 == EXP_ORDER - return simple_subgroup_check_G1(p); + return G1_simple_subgroup_check(p); #elif MEMBERSHIP_CHECK_G1 == BOWE // section 3.2 from https://eprint.iacr.org/2019/814.pdf return bowe_subgroup_check_G1(p); @@ -47,7 +47,7 @@ int G2_check_membership(const E2* p){ return INVALID; // check p is in G2 #if MEMBERSHIP_CHECK_G2 == EXP_ORDER - return simple_subgroup_check_G2(p); + return G2_simple_subgroup_check(p); #elif MEMBERSHIP_CHECK_G2 == BOWE // TODO: implement Bowe's check return UNDEFINED; @@ -172,7 +172,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (ret != RLC_OK) goto out; // check s is in G1 - ret = check_membership_G1(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -260,7 +260,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (ret != RLC_OK) goto out; // check s in G1 - ret = check_membership_G1(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -346,7 +346,7 @@ int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { } // check s is in G1 - if (check_membership_G1(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 + if (G1_check_membership(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 return INVALID; } @@ -495,7 +495,7 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. 
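    // Scaling signature i and public key i by the same random scalar prevents an
    // adversary from submitting invalid signatures whose errors cancel out across
    // indices in the aggregate; with independent random scalars such a cancellation
    // succeeds only with negligible probability.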
int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { + if (read_ret != RLC_OK || G1_check_membership(&sigs[i]) != VALID) { if (read_ret == UNDEFINED) {// unexpected error case goto out; }; diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 949e1f6d3b7..e9f9a902d0b 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -152,8 +152,8 @@ func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { // check decoding results are consistent var pointFlow pointE1 - // here we test readPointG1 rather than the simple Signature type alias - err := readPointG1(&pointFlow, sigBytes) + // here we test readPointE1 rather than the simple Signature type alias + err := readPointE1(&pointFlow, sigBytes) flowPass := (err == nil) && (checkMembershipG1(&pointFlow) == int(valid)) var pointBLST blst.P1Affine From f1045056e2f979b7f1f614269fa867c7d8b9db12 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 17 Apr 2023 18:54:46 -0600 Subject: [PATCH 036/200] various renaming in DKG functions --- crypto/bls_thresholdsign.go | 4 ++-- crypto/bls_thresholdsign_core.c | 12 ++++++------ crypto/bls_thresholdsign_include.h | 2 +- crypto/dkg_core.c | 15 ++++++++------- crypto/dkg_feldmanvss.go | 4 ++-- crypto/dkg_feldmanvssq.go | 2 +- crypto/dkg_include.h | 4 ++-- 7 files changed, 22 insertions(+), 21 deletions(-) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 72fa421def3..5ff2e3a4550 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -413,7 +413,7 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat } // Lagrange Interpolate at point 0 - result := C.G1_lagrangeInterpolateAtZero_serialized( + result := C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&shares[0]), (*C.uint8_t)(&signers[0]), (C.int)(s.threshold+1)) @@ -501,7 +501,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, thresholdSignature := make([]byte, signatureLengthBLSBLS12381) // Lagrange Interpolate at point 0 - if C.G1_lagrangeInterpolateAtZero_serialized( + if C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&flatShares[0]), (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold+1), diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 75542763f6a..96d07f2a42e 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -9,7 +9,7 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] // and stores it in `res`, where t is the degree of the polynomial P. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t indices[], const int len){ +static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indices[], const int len){ // coefficient is computed as N * D^(-1) Fr numerator; // eventually would represent N*R^k @@ -65,7 +65,7 @@ static void Fr_lagrangeCoefficientAtZero(Fr* res, const int i, const uint8_t ind // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. 
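// With pairwise distinct indices x_k = indices[k], the coefficients are
//   L_i(0) = prod_{j != i} x_j / (x_j - x_i)  (mod r)
// and the interpolated point is dest = sum_i L_i(0).shares[i], computed below with
// roughly one G1 scalar multiplication and one point addition per share.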
-static void G1_lagrangeInterpolateAtZero(ep_st* dest, const ep_st shares[], const uint8_t indices[], const int len) { +static void E1_lagrange_interpolate_at_zero(ep_st* dest, const ep_st shares[], const uint8_t indices[], const int len) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1 // where A_i = g1 ^ a_i @@ -79,7 +79,7 @@ static void G1_lagrangeInterpolateAtZero(ep_st* dest, const ep_st shares[], cons Fr fr_lagr_coef; for (int i=0; i < len; i++) { - Fr_lagrangeCoefficientAtZero(&fr_lagr_coef, i, indices, len); + Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); bn_st* bn_lagr_coef = Fr_blst_to_relic(&fr_lagr_coef); ep_mul_lwnaf(mult, &shares[i], bn_lagr_coef); free(bn_lagr_coef); @@ -90,9 +90,9 @@ static void G1_lagrangeInterpolateAtZero(ep_st* dest, const ep_st shares[], cons } // Computes the Langrange interpolation at zero LI(0) with regards to the indices [indices(0)..indices(t)] -// and their G1 concatenated serializations [shares(1)..shares(t+1)], and stores the serialized result in `dest`. +// and writes their E1 concatenated serializations [shares(1)..shares(t+1)] in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. -int G1_lagrangeInterpolateAtZero_serialized(byte* dest, const byte* shares, const uint8_t indices[], const int len) { +int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const uint8_t indices[], const int len) { int read_ret; // temp variables ep_t res; @@ -108,7 +108,7 @@ int G1_lagrangeInterpolateAtZero_serialized(byte* dest, const byte* shares, cons // G1 interpolation at 0 // computes Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1, // where A_i = g1 ^ a_i - G1_lagrangeInterpolateAtZero(res, ep_shares, indices, len); + E1_lagrange_interpolate_at_zero(res, ep_shares, indices, len); // export the result ep_write_bin_compact(dest, res, SIGNATURE_LEN); diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 1bc5809d405..e39e4a06887 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -5,7 +5,7 @@ #include "bls_include.h" -int G1_lagrangeInterpolateAtZero_serialized(byte*, const byte* , const uint8_t[], const int); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index aedf5d83164..9b51c89d32b 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -8,10 +8,10 @@ #define T_max ((N_max-1)/2) // computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) -// r being the order of G1 -// writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer -void Fr_polynomial_image_export(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ +// r being the order of G1, +// and writes P(x) in out and P(x).g2 in y if y is non NULL +// x being a small integer (byte). +void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ Fr image; Fr_polynomial_image(&image, y, a, a_size, x); // exports the result @@ -83,9 +83,10 @@ BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ return BLST_SUCCESS; } -// returns 1 if g2^x = y, where g2 is the generator of G2 -// returns 0 otherwise -bool_t verify_share(const Fr* x, const E2* y) { +// checks the discrete log relationship in G2. +// - returns 1 if g2^x = y, where g2 is the generator of G2 +// - returns 0 otherwise. 
+bool_t G2_check_log(const Fr* x, const E2* y) { E2 tmp; G2_mult_gen(&tmp, x); return E2_is_equal(&tmp, y); diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 51922814b17..fbc4e5eaf68 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -412,7 +412,7 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { // P(x) is written in dest, while g2^P(x) is written in y // x being a small integer func frPolynomialImage(dest []byte, a []scalar, x index, y *pointE2) { - C.Fr_polynomial_image_export((*C.uchar)(&dest[0]), + C.Fr_polynomial_image_write((*C.uchar)(&dest[0]), (*C.E2)(y), (*C.Fr)(&a[0]), (C.int)(len(a)), (C.uint8_t)(x), @@ -444,7 +444,7 @@ func readVerifVector(A []pointE2, src []byte) error { func (s *feldmanVSSstate) verifyShare() bool { // check y[current] == x.G2 - return C.verify_share( + return C.G2_check_log( (*C.Fr)(&s.x), (*C.E2)(&s.y[s.myIndex])) != 0 } diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 38b3667ffae..ae929aa49ff 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -506,7 +506,7 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) // - true if the complaint answer is not correct func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { // check y[complainer] == share.G2 - return C.verify_share( + return C.G2_check_log( (*C.Fr)(&c.answer), (*C.E2)(&s.y[complainer])) == 0 } diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 8a1248cacd9..c467a43714b 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -5,11 +5,11 @@ #include "bls12381_utils.h" -void Fr_polynomial_image_export(byte* out, E2* y, const Fr* a, const int a_size, const byte x); +void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int a_size, const byte x); void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int len_A); void G2_vector_write_bytes(byte* out, const E2* A, const int len); BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len); -bool_t verify_share(const Fr* x, const E2* y); +bool_t G2_check_log(const Fr* x, const E2* y); #endif From 6f044c26166a2b2a6550687f52e9f3d735702a4a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 18 Apr 2023 17:49:47 -0600 Subject: [PATCH 037/200] renaming and add Fr_generate_poly function --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 4 +- crypto/bls12381_utils.go | 10 ++--- crypto/bls12381_utils.h | 2 +- crypto/bls_thresholdsign.go | 4 -- crypto/bls_thresholdsign_include.h | 3 +- crypto/blst_include.h | 1 + crypto/blst_src/blst_src.c | 3 +- crypto/dkg_core.c | 62 ++++++++++++++++++++++++++++-- crypto/dkg_feldmanvss.go | 5 --- crypto/dkg_include.h | 1 + 11 files changed, 74 insertions(+), 23 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index d45ea7f3aeb..34281e0aab5 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -294,7 +294,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { defer overwrite(okm) // overwrite okm // map the bytes to a private key : SK = OS2IP(OKM) mod r - isZero := mapToZr(&sk.scalar, okm) + isZero := mapToFr(&sk.scalar, okm) if !isZero { return sk, nil } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9b91e8e0ebd..9518320d051 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -123,6 +123,8 @@ prec_st* init_precomputed_data_BLS12_381() { 
return bls_prec; } +// ------------------- Utilities + // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -372,7 +374,7 @@ static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { +bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { vec256_from_be_bytes(a, bin, len); return Fr_is_zero(a); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 59776fcec5b..636ddbc0824 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -129,7 +129,7 @@ func randFr(x *scalar) error { if err != nil { return errors.New("internal rng failed") } - _ = mapToZr(x, bytes) + _ = mapToFr(x, bytes) return nil } @@ -142,19 +142,19 @@ func randFrStar(x *scalar) error { if err != nil { return errors.New("internal rng failed") } - isZero = mapToZr(x, bytes) + isZero = mapToFr(x, bytes) } return nil } -// mapToZr reads a scalar from a slice of bytes and maps it to Zr. +// mapToFr reads a scalar from a slice of bytes and maps it to Zr. // The resulting element `k` therefore satisfies 0 <= k < r. // It returns true if scalar is zero and false otherwise. -func mapToZr(x *scalar, src []byte) bool { +func mapToFr(x *scalar, src []byte) bool { isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - return bool(isZero) + return isZero != (C.ulonglong)(0) } // writeScalar writes a scalar in a slice of bytes diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 01f68610603..3e4c84ed43f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -116,7 +116,7 @@ void Fr_inv_exp_montg(Fr *res, const Fr *a); BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); void Fr_write_bytes(uint8_t *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const uint8_t*, int); +bool_t map_bytes_to_Fr(Fr*, const uint8_t*, int); // Fp utilities diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 5ff2e3a4550..008fc1d7ae8 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -558,10 +558,6 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, y := make([]pointE2, size) var X0 pointE2 - // seed relic - if err := seedRelic(seed); err != nil { - return nil, nil, nil, fmt.Errorf("seeding relic failed: %w", err) - } // Generate a polynomial P in Fr[X] of degree t a := make([]scalar, threshold+1) if err := randFrStar(&a[0]); err != nil { // non-identity key diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index e39e4a06887..a10f482cceb 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -5,7 +5,8 @@ #include "bls_include.h" -int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); +extern void Fr_generate_polynomial(Fr* a); #endif diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 65f552d6fae..64b8e4562b8 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -11,6 +11,7 @@ #include "fields.h" 
#include "consts.h" #include "errors.h" +#include "sha256.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index 4b0732e06e4..dc2d2c40a4e 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,6 +1,7 @@ // +build relic -#include "keygen.c" +// keygen.c is not included as it is imported by dkg_core and is not needed +// by bls12_381_utils #include "hash_to_field.c" #include "e1.c" #include "map_to_g1.c" diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 9b51c89d32b..3a8356bbbf3 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -3,11 +3,65 @@ #include "dkg_include.h" -#define N_max 250 -#define N_bits_max 8 // log(250) -#define T_max ((N_max-1)/2) +// HKDF is used to extract and expand entropy +// `hkdf_ctx` holds the context of a HKDF instance +#include "keygen.c" // imported here in order to import BLST's `HMAC_SHA256_CTX` +typedef struct { + HMAC_SHA256_CTX hmac_ctx; // HMAC context + byte prk[32]; // pseudo-random key used by HKDF +} hkdf_ctx; -// computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) +// instanciate a HKDF to extract entropy from `ikm`. +static hkdf_ctx* get_hkdf_ctx(const byte* ikm, const int ikm_len) { + hkdf_ctx* ctx = (hkdf_ctx*) malloc(sizeof(hkdf_ctx)); + HKDF_Extract(ctx->prk, NULL, 0, ikm, ikm_len, 0, &ctx->hmac_ctx); + return ctx; +} + +// expand entropy from a HKDF instance +static void expand_entropy(byte* dest, const int len, hkdf_ctx* ctx) { + HKDF_Expand(dest, len, ctx->prk, NULL, 0, 0, &ctx->hmac_ctx); +} + +// generate a polynomial P = a_0 + a_1*x + .. + a_n x^n in F_r +// where degree `n` is input `degree` (higher degree monomial in non-zero). +// P also guarantees `a_0` is non zero (for single dealer BLS-DKGs, this insures +// protocol public key output is not identity). +// +// `seed` is used as the source of entropy of the secret polynomial. +// `seed_len` is required to be at least 16, and it is not checked in the function. 
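A minimal Go sketch of the same extract-then-expand construction used by Fr_generate_polynomial below, assuming golang.org/x/crypto/hkdf in place of BLST's internal HKDF helpers; the 48-byte coefficient width mirrors Fr_BYTES + Fr_BYTES/2, everything else (names, seed value) is illustrative:

package main

import (
	"crypto/sha256"
	"fmt"
	"io"
	"math/big"

	"golang.org/x/crypto/hkdf"
)

// r is the BLS12-381 scalar field order.
var r, _ = new(big.Int).SetString("73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// generatePolynomial expands a seed into degree+1 coefficients in F_r,
// forcing a_0 and a_degree to be non-zero.
func generatePolynomial(seed []byte, degree int) []*big.Int {
	// extract a PRK from the seed, then expand it into coefficient bytes
	prg := hkdf.New(sha256.New, seed, nil, nil)
	// 48 bytes per coefficient: 16 extra bytes over the 32-byte scalar size keep
	// the bias of the reduction mod r below roughly 2^-128
	buf := make([]byte, 48)
	a := make([]*big.Int, degree+1)
	for k := 0; k <= degree; k++ {
		for {
			if _, err := io.ReadFull(prg, buf); err != nil {
				panic(err) // HKDF output exhausted; cannot happen for small degrees
			}
			c := new(big.Int).Mod(new(big.Int).SetBytes(buf), r)
			if c.Sign() != 0 || (k != 0 && k != degree) {
				a[k] = c
				break
			}
		}
	}
	return a
}

func main() {
	a := generatePolynomial([]byte("a seed of at least 16 bytes....."), 3)
	fmt.Println(len(a), a[0].Sign() != 0, a[3].Sign() != 0) // 4 true true
}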
+void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const int seed_len) { + // use HKDF to expand `seed` into the needed bytes + hkdf_ctx* ctx = get_hkdf_ctx(seed, seed_len); + // bytes of each coefficient a_i + // use extra 128 bits to reduce the modular reduction bias (128 is half of Fr_BYTES) + const int coef_bytes_len = Fr_BYTES + Fr_BYTES/2; + byte coef_bytes[coef_bytes_len]; + + // generate a_0 in F_r* + bool_t is_zero = 1; + while (is_zero) { + expand_entropy(coef_bytes, coef_bytes_len, ctx); + is_zero = map_bytes_to_Fr(&a[0], coef_bytes, coef_bytes_len); + } + + if (degree > 1) { + // genarate a_i on F_r, for 0 Date: Tue, 18 Apr 2023 18:00:57 -0600 Subject: [PATCH 038/200] update C polynomial headers to use degree --- crypto/bls_thresholdsign.go | 2 +- crypto/dkg_core.c | 23 +++++++++++------------ crypto/dkg_feldmanvss.go | 4 ++-- crypto/dkg_include.h | 6 +++--- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 008fc1d7ae8..8f28d048b63 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -578,7 +578,7 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, C.Fr_polynomial_image( (*C.Fr)(&x[i-1]), (*C.E2)(&y[i-1]), - (*C.Fr)(&a[0]), (C.int)(len(a)), + (*C.Fr)(&a[0]), (C.int)(len(a)-1), (C.uint8_t)(i), ) } diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 3a8356bbbf3..a8c0c976382 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -62,27 +62,26 @@ void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const in } // computes P(x) = a_0 + a_1*x + .. + a_n x^n in F_r -// r being the order of G1, -// and writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer (byte). -void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int a_size, const byte x){ +// where `x` is a small integer (byte) and `degree` is P's degree n. +// P(x) is written in `out` and P(x).g2 is written in `y` if `y` is non NULL. +void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int degree, const byte x){ Fr image; - Fr_polynomial_image(&image, y, a, a_size, x); + Fr_polynomial_image(&image, y, a, degree, x); // exports the result Fr_write_bytes(out, &image); } // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. -// a_i are all in Fr, `a_size` - 1 is P's degree, x is a small integer less than 255. +// a_i are all in Fr, `degree` is P's degree, x is a small integer less than 255. // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL -void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int a_size, const byte x){ +void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int degree, const byte x){ Fr_set_zero(image); // convert `x` to Montgomery form Fr xR; Fr_set_limb(&xR, (limb_t)x); Fr_to_montg(&xR, &xR); - for (int i = a_size-1; i >= 0; i--) { + for (int i = degree; i >= 0; i--) { Fr_mul_montg(image, image, &xR); Fr_add(image, image, &a[i]); // image is in normal form } @@ -94,9 +93,9 @@ void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int a_size, const // computes Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2 // and stores the point in y -static void E2_polynomial_image(E2* y, const E2* A, const int len_A, const byte x){ +static void E2_polynomial_image(E2* y, const E2* A, const int degree, const byte x){ E2_set_infty(y); - for (int i = len_A-1; i >= 0 ; i--) { + for (int i = degree; i >= 0 ; i--) { E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo E2_add(y, y, &A[i]); } @@ -105,10 +104,10 @@ static void E2_polynomial_image(E2* y, const E2* A, const int len_A, const byte // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2[X] -void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int len_A) { +void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int degree) { for (byte i=0; i Date: Wed, 19 Apr 2023 14:08:55 -0600 Subject: [PATCH 039/200] use pseudo-random randFr, randFrStar and FrPolynomial --- crypto/bls12381_utils.go | 33 ++++----- crypto/bls12381_utils_test.go | 10 ++- crypto/bls_thresholdsign.go | 19 ++--- crypto/bls_thresholdsign_include.h | 5 +- crypto/bls_thresholdsign_test.go | 109 +++++++++++++++-------------- crypto/dkg_core.c | 59 ---------------- crypto/dkg_feldmanvss.go | 77 ++++++++++++++------ crypto/dkg_include.h | 1 - 8 files changed, 142 insertions(+), 171 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 636ddbc0824..0756f09472e 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,8 +14,9 @@ package crypto // #include "bls12381_utils.h" import "C" import ( - "crypto/rand" "errors" + + "github.com/onflow/flow-go/crypto/random" ) // Go wrappers around BLST C types @@ -122,29 +123,25 @@ func (p *pointE2) isInfinity() bool { return C.E2_is_infty((*C.E2)(p)) != 0 } -// returns a random element of Fr in input pointer -func randFr(x *scalar) error { +// generates a random element in F_r using input random source, +// and saves the random in `x`. +// returns `true` if generated element is zero. +func randFr(x *scalar, rand random.Rand) bool { + // use extra 128 bits to reduce the modular reduction bias bytes := make([]byte, frBytesLen+securityBits/8) - _, err := rand.Read(bytes) // checking one output is enough - if err != nil { - return errors.New("internal rng failed") - } - _ = mapToFr(x, bytes) - return nil + rand.Read(bytes) // checking one output is enough + // modular reduction + return mapToFr(x, bytes) } -// writes a random element of Fr* in input pointer -func randFrStar(x *scalar) error { - bytes := make([]byte, frBytesLen+securityBits/8) +// generates a random element in F_r* using input random source, +// and saves the random in `x`. +func randFrStar(x *scalar, rand random.Rand) { isZero := true + // exteremely unlikely this loop runs more than once for isZero { - _, err := rand.Read(bytes) // checking one output is enough - if err != nil { - return errors.New("internal rng failed") - } - isZero = mapToFr(x, bytes) + isZero = randFr(x, rand) } - return nil } // mapToFr reads a scalar from a slice of bytes and maps it to Zr. 
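The new randFr/randFrStar semantics amount to "reduce 48 uniformly random bytes mod r, retry on zero". A standalone sketch of that behavior, using crypto/rand in place of the random.Rand source (illustrative names, not the package API):

package main

import (
	"crypto/rand"
	"fmt"
	"math/big"
)

// r is the BLS12-381 scalar field order (roughly 2^255).
var r, _ = new(big.Int).SetString("73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// randFr reduces 48 uniformly random bytes mod r and reports whether the
// result is zero. Reducing 384 random bits into a ~255-bit modulus keeps the
// statistical distance from uniform below roughly 2^-128.
func randFr() (*big.Int, bool) {
	buf := make([]byte, 48)
	if _, err := rand.Read(buf); err != nil {
		panic(err)
	}
	x := new(big.Int).Mod(new(big.Int).SetBytes(buf), r)
	return x, x.Sign() == 0
}

// randFrStar retries until the scalar is non-zero; zero occurs with
// probability about 2^-255, so the loop essentially never repeats.
func randFrStar() *big.Int {
	for {
		if x, isZero := randFr(); !isZero {
			return x
		}
	}
}

func main() {
	x := randFrStar()
	fmt.Println(x.Sign() != 0 && x.Cmp(r) < 0) // true
}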
diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index cf0c37d7856..51eaa744284 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -10,16 +10,20 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/onflow/flow-go/crypto/random" ) // G1 and G2 scalar multiplication func BenchmarkScalarMultG1G2(b *testing.B) { - seed := make([]byte, securityBits/8) + seed := make([]byte, random.Chacha20SeedLen) _, _ = rand.Read(seed) - _ = seedRelic(seed) + prg, err := random.NewChacha20PRG(seed, nil) + require.NoError(b, err) + var expo scalar - _ = randFr(&expo) + _ = randFr(&expo, prg) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 8f28d048b63..1d19ca42504 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -533,11 +533,13 @@ func EnoughShares(threshold int, sharesNumber int) (bool, error) { // // The function returns : // - (nil, nil, nil, invalidInputsErrorf) if: +// - seed is too short // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - (groupPrivKey, []pubKeyShares, groupPubKey, nil) otherwise func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, []PublicKey, PublicKey, error) { + if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, nil, nil, invalidInputsErrorf( "size should be between %d and %d, got %d", @@ -559,20 +561,11 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, var X0 pointE2 // Generate a polynomial P in Fr[X] of degree t - a := make([]scalar, threshold+1) - if err := randFrStar(&a[0]); err != nil { // non-identity key - return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) - } - if threshold > 0 { - for i := 1; i < threshold; i++ { - if err := randFr(&a[i]); err != nil { - return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) - } - } - if err := randFrStar(&a[threshold]); err != nil { // enforce the polynomial degree - return nil, nil, nil, fmt.Errorf("generating the random polynomial failed: %w", err) - } + a, err := generateFrPolynomial(seed, threshold) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to generate random polynomial: %w", err) } + // compute the shares for i := index(1); int(i) <= size; i++ { C.Fr_polynomial_image( diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index a10f482cceb..ce88c460f95 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -5,8 +5,7 @@ #include "bls_include.h" -int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); -extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); -extern void Fr_generate_polynomial(Fr* a); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); +extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 6d873da6e68..f04b199732b 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -21,8 +21,8 @@ func TestBLSThresholdSignature(t *testing.T) { t.Run("centralized_stateless_keygen", testCentralizedStatelessAPI) // stateful API 
t.Run("centralized_stateful_keygen", testCentralizedStatefulAPI) - //t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) - //t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case + t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) + t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case } const thresholdSignatureTag = "random tag" @@ -546,67 +546,68 @@ type statelessKeys struct { // Centralized test of threshold signature protocol using the threshold key generation. func testCentralizedStatelessAPI(t *testing.T) { n := 10 - for threshold := MinimumThreshold; threshold < n; threshold++ { - // generate threshold keys - r := time.Now().UnixNano() - mrand.Seed(r) - t.Log(r) - seed := make([]byte, SeedMinLenDKG) - _, err := mrand.Read(seed) - require.NoError(t, err) - skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) - require.NoError(t, err) - // signature hasher - kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) - // generate signature shares - signShares := make([]Signature, 0, n) - signers := make([]int, 0, n) - // fill the signers list and shuffle it - for i := 0; i < n; i++ { - signers = append(signers, i) - } - mrand.Shuffle(n, func(i, j int) { - signers[i], signers[j] = signers[j], signers[i] - }) - // create (t+1) signatures of the first randomly chosen signers - for j := 0; j < threshold+1; j++ { - i := signers[j] - share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) - require.NoError(t, err) - verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) - require.NoError(t, err) - assert.True(t, verif, "signature share is not valid") - if verif { - signShares = append(signShares, share) - } - } - // reconstruct and test the threshold signature - thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + threshold := 6 + //for threshold := MinimumThreshold; threshold < n; threshold++ { + // generate threshold keys + r := time.Now().UnixNano() + mrand.Seed(r) + t.Log(r) + seed := make([]byte, SeedMinLenDKG) + _, err := mrand.Read(seed) + require.NoError(t, err) + skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) + require.NoError(t, err) + // signature hasher + kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) + // generate signature shares + signShares := make([]Signature, 0, n) + signers := make([]int, 0, n) + // fill the signers list and shuffle it + for i := 0; i < n; i++ { + signers = append(signers, i) + } + mrand.Shuffle(n, func(i, j int) { + signers[i], signers[j] = signers[j], signers[i] + }) + // create (t+1) signatures of the first randomly chosen signers + for j := 0; j < threshold+1; j++ { + i := signers[j] + share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) require.NoError(t, err) - verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) + verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) require.NoError(t, err) assert.True(t, verif, "signature share is not valid") - - // check failure with a random redundant signer - if threshold > 1 { - randomDuplicate := mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold - tmp := signers[randomDuplicate] - signers[randomDuplicate] = signers[0] - thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, 
signers[:threshold+1]) - assert.Error(t, err) - assert.True(t, IsDuplicatedSignerError(err)) - assert.Nil(t, thresholdSignature) - signers[randomDuplicate] = tmp + if verif { + signShares = append(signShares, share) } + } + // reconstruct and test the threshold signature + thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + require.NoError(t, err) + verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) + require.NoError(t, err) + assert.True(t, verif, "signature share is not valid") - // check with an invalid signature (invalid serialization) - invalidSig := make([]byte, signatureLengthBLSBLS12381) - signShares[0] = invalidSig + // check failure with a random redundant signer + if threshold > 1 { + randomDuplicate := mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold + tmp := signers[randomDuplicate] + signers[randomDuplicate] = signers[0] thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) assert.Error(t, err) - assert.True(t, IsInvalidSignatureError(err)) + assert.True(t, IsDuplicatedSignerError(err)) assert.Nil(t, thresholdSignature) + signers[randomDuplicate] = tmp } + + // check with an invalid signature (invalid serialization) + invalidSig := make([]byte, signatureLengthBLSBLS12381) + signShares[0] = invalidSig + thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + assert.Error(t, err) + assert.True(t, IsInvalidSignatureError(err)) + assert.Nil(t, thresholdSignature) + //} } func BenchmarkSimpleKeyGen(b *testing.B) { diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index a8c0c976382..48d1f72f752 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -2,65 +2,6 @@ #include "dkg_include.h" - -// HKDF is used to extract and expand entropy -// `hkdf_ctx` holds the context of a HKDF instance -#include "keygen.c" // imported here in order to import BLST's `HMAC_SHA256_CTX` -typedef struct { - HMAC_SHA256_CTX hmac_ctx; // HMAC context - byte prk[32]; // pseudo-random key used by HKDF -} hkdf_ctx; - -// instanciate a HKDF to extract entropy from `ikm`. -static hkdf_ctx* get_hkdf_ctx(const byte* ikm, const int ikm_len) { - hkdf_ctx* ctx = (hkdf_ctx*) malloc(sizeof(hkdf_ctx)); - HKDF_Extract(ctx->prk, NULL, 0, ikm, ikm_len, 0, &ctx->hmac_ctx); - return ctx; -} - -// expand entropy from a HKDF instance -static void expand_entropy(byte* dest, const int len, hkdf_ctx* ctx) { - HKDF_Expand(dest, len, ctx->prk, NULL, 0, 0, &ctx->hmac_ctx); -} - -// generate a polynomial P = a_0 + a_1*x + .. + a_n x^n in F_r -// where degree `n` is input `degree` (higher degree monomial in non-zero). -// P also guarantees `a_0` is non zero (for single dealer BLS-DKGs, this insures -// protocol public key output is not identity). -// -// `seed` is used as the source of entropy of the secret polynomial. -// `seed_len` is required to be at least 16, and it is not checked in the function. 
-void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const int seed_len) { - // use HKDF to expand `seed` into the needed bytes - hkdf_ctx* ctx = get_hkdf_ctx(seed, seed_len); - // bytes of each coefficient a_i - // use extra 128 bits to reduce the modular reduction bias (128 is half of Fr_BYTES) - const int coef_bytes_len = Fr_BYTES + Fr_BYTES/2; - byte coef_bytes[coef_bytes_len]; - - // generate a_0 in F_r* - bool_t is_zero = 1; - while (is_zero) { - expand_entropy(coef_bytes, coef_bytes_len, ctx); - is_zero = map_bytes_to_Fr(&a[0], coef_bytes, coef_bytes_len); - } - - if (degree > 1) { - // genarate a_i on F_r, for 0 0 { + // genarate a_i on F_r, for 0 0 { - for i := 1; i < s.threshold; i++ { - if err := randFr(&s.a[i]); err != nil { - return fmt.Errorf("generating the polynomial failed: %w", err) - } - generatorScalarMultG2(&s.vA[i], &s.a[i]) - } - // non-zero a[t] to enforce the polynomial degree - if err := randFrStar(&s.a[s.threshold]); err != nil { - return fmt.Errorf("generating the polynomial failed: %w", err) - } - generatorScalarMultG2(&s.vA[s.threshold], &s.a[s.threshold]) + + // Generate a random polyomial P in Fr[X] of degree t (coefficients are a_i) + // `s.a` are the coefficients of P + // - a_degree is non-zero as deg(P) = degree + // - `a_0` is non-zero to make sure BLS-DKG public key is non-identity + var err error + s.a, err = generateFrPolynomial(seed, s.threshold) + if err != nil { + return fmt.Errorf("failed to generate random polynomial: %w", err) + } + + // compute the verification vector A_i = g2^a_i + s.vA = make([]pointE2, s.threshold+1) + for i := 0; i <= s.threshold; i++ { + generatorScalarMultG2(&s.vA[i], &s.a[i]) } // compute the shares diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 6e5c9241638..e8489fbf669 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -5,7 +5,6 @@ #include "bls12381_utils.h" -void Fr_generate_polynomial(Fr* a, const int degree, const byte* seed, const int seed_len); void Fr_polynomial_image_write(byte* out, E2* y, const Fr* a, const int deg, const byte x); void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int deg, const byte x); void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int deg); From 3d26f0cf870bb3cb6f7b213e5e0bdc026edfc94d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 19 Apr 2023 14:24:00 -0600 Subject: [PATCH 040/200] unify seed lengths of DKG and other keyGen seed lengths --- crypto/bls.go | 3 +- crypto/bls12381_utils_test.go | 9 +-- crypto/bls_thresholdsign_test.go | 119 +++++++++++++++---------------- crypto/dkg.go | 3 - crypto/dkg_feldmanvss.go | 6 +- crypto/dkg_test.go | 6 +- 6 files changed, 70 insertions(+), 76 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 34281e0aab5..0cec3458bbf 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -293,7 +293,8 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { } defer overwrite(okm) // overwrite okm - // map the bytes to a private key : SK = OS2IP(OKM) mod r + // map the bytes to a private key using modular reduction + // SK = OS2IP(OKM) mod r isZero := mapToFr(&sk.scalar, okm) if !isZero { return sk, nil diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 51eaa744284..ed72a5ec84b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -10,20 +10,17 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - - "github.com/onflow/flow-go/crypto/random" ) // G1 and G2 
scalar multiplication func BenchmarkScalarMultG1G2(b *testing.B) { - seed := make([]byte, random.Chacha20SeedLen) - _, _ = rand.Read(seed) - prg, err := random.NewChacha20PRG(seed, nil) + seed := make([]byte, frBytesLen) + _, err := rand.Read(seed) require.NoError(b, err) var expo scalar - _ = randFr(&expo, prg) + _ = mapToFr(&expo, seed) // G1 generator multiplication b.Run("G1 gen", func(b *testing.B) { diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index f04b199732b..dfcba3ecccb 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -37,7 +37,7 @@ func testCentralizedStatefulAPI(t *testing.T) { n := 10 for threshold := MinimumThreshold; threshold < n; threshold++ { // generate threshold keys - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) _, err := mrand.Read(seed) require.NoError(t, err) skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) @@ -346,9 +346,9 @@ func testDistributedStatefulAPI_FeldmanVSS(t *testing.T) { chans[i] = make(chan *message, 2*n) } // start DKG in all participants - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) read, err := rand.Read(seed) - require.Equal(t, read, SeedMinLenDKG) + require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sync.Add(n) for current := 0; current < n; current++ { @@ -405,9 +405,9 @@ func testDistributedStatefulAPI_JointFeldman(t *testing.T) { chans[i] = make(chan *message, 2*n) } // start DKG in all participants but the - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) read, err := rand.Read(seed) - require.Equal(t, read, SeedMinLenDKG) + require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sync.Add(n) for current := 0; current < n; current++ { @@ -546,73 +546,72 @@ type statelessKeys struct { // Centralized test of threshold signature protocol using the threshold key generation. 
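Condensed, the flow exercised by the test below is roughly the following, assuming the exported package API these patches touch (BLSThresholdKeyGen, NewExpandMsgXOFKMAC128, BLSReconstructThresholdSignature, KeyGenSeedMinLen) and leaving build-tag and module setup aside; a usage sketch, not part of the test suite:

package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	n, threshold := 10, 6
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}

	// centralized (trusted dealer) threshold key generation
	skShares, pkShares, pkGroup, err := crypto.BLSThresholdKeyGen(n, threshold, seed)
	if err != nil {
		panic(err)
	}

	// threshold+1 signers each produce a signature share
	hasher := crypto.NewExpandMsgXOFKMAC128("example tag")
	msg := []byte("message")
	shares := make([]crypto.Signature, 0, threshold+1)
	signers := make([]int, 0, threshold+1)
	for i := 0; i <= threshold; i++ {
		share, err := skShares[i].Sign(msg, hasher)
		if err != nil {
			panic(err)
		}
		shares = append(shares, share)
		signers = append(signers, i)
	}

	// reconstruct the group signature and verify it under the group public key
	groupSig, err := crypto.BLSReconstructThresholdSignature(n, threshold, shares, signers)
	if err != nil {
		panic(err)
	}
	ok, err := pkGroup.Verify(groupSig, msg, hasher)
	fmt.Println(ok, err == nil, len(pkShares) == n) // true true true
}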
func testCentralizedStatelessAPI(t *testing.T) { n := 10 - threshold := 6 - //for threshold := MinimumThreshold; threshold < n; threshold++ { - // generate threshold keys - r := time.Now().UnixNano() - mrand.Seed(r) - t.Log(r) - seed := make([]byte, SeedMinLenDKG) - _, err := mrand.Read(seed) - require.NoError(t, err) - skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) - require.NoError(t, err) - // signature hasher - kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) - // generate signature shares - signShares := make([]Signature, 0, n) - signers := make([]int, 0, n) - // fill the signers list and shuffle it - for i := 0; i < n; i++ { - signers = append(signers, i) - } - mrand.Shuffle(n, func(i, j int) { - signers[i], signers[j] = signers[j], signers[i] - }) - // create (t+1) signatures of the first randomly chosen signers - for j := 0; j < threshold+1; j++ { - i := signers[j] - share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) + for threshold := MinimumThreshold; threshold < n; threshold++ { + // generate threshold keys + r := time.Now().UnixNano() + mrand.Seed(r) + t.Log(r) + seed := make([]byte, KeyGenSeedMinLen) + _, err := mrand.Read(seed) + require.NoError(t, err) + skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) + require.NoError(t, err) + // signature hasher + kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) + // generate signature shares + signShares := make([]Signature, 0, n) + signers := make([]int, 0, n) + // fill the signers list and shuffle it + for i := 0; i < n; i++ { + signers = append(signers, i) + } + mrand.Shuffle(n, func(i, j int) { + signers[i], signers[j] = signers[j], signers[i] + }) + // create (t+1) signatures of the first randomly chosen signers + for j := 0; j < threshold+1; j++ { + i := signers[j] + share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) + require.NoError(t, err) + verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) + require.NoError(t, err) + assert.True(t, verif, "signature share is not valid") + if verif { + signShares = append(signShares, share) + } + } + // reconstruct and test the threshold signature + thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) require.NoError(t, err) - verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) + verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) require.NoError(t, err) assert.True(t, verif, "signature share is not valid") - if verif { - signShares = append(signShares, share) + + // check failure with a random redundant signer + if threshold > 1 { + randomDuplicate := mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold + tmp := signers[randomDuplicate] + signers[randomDuplicate] = signers[0] + thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) + assert.Error(t, err) + assert.True(t, IsDuplicatedSignerError(err)) + assert.Nil(t, thresholdSignature) + signers[randomDuplicate] = tmp } - } - // reconstruct and test the threshold signature - thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) - require.NoError(t, err) - verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) - require.NoError(t, err) - assert.True(t, verif, "signature share is not valid") - // check failure with a random redundant signer - if threshold > 1 { - randomDuplicate := 
mrand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold - tmp := signers[randomDuplicate] - signers[randomDuplicate] = signers[0] + // check with an invalid signature (invalid serialization) + invalidSig := make([]byte, signatureLengthBLSBLS12381) + signShares[0] = invalidSig thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) assert.Error(t, err) - assert.True(t, IsDuplicatedSignerError(err)) + assert.True(t, IsInvalidSignatureError(err)) assert.Nil(t, thresholdSignature) - signers[randomDuplicate] = tmp } - - // check with an invalid signature (invalid serialization) - invalidSig := make([]byte, signatureLengthBLSBLS12381) - signShares[0] = invalidSig - thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) - assert.Error(t, err) - assert.True(t, IsInvalidSignatureError(err)) - assert.Nil(t, thresholdSignature) - //} } func BenchmarkSimpleKeyGen(b *testing.B) { n := 60 - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) _, _ = rand.Read(seed) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -623,7 +622,7 @@ func BenchmarkSimpleKeyGen(b *testing.B) { func BenchmarkSignatureReconstruction(b *testing.B) { n := 60 - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) _, _ = rand.Read(seed) threshold := 40 // generate threshold keys diff --git a/crypto/dkg.go b/crypto/dkg.go index 1cdf87a128e..03305d016c7 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -34,9 +34,6 @@ const ( DKGMinSize int = MinimumThreshold + 1 // DKGMaxSize is the maximum size of a group participating in a DKG protocol DKGMaxSize int = 254 - // SeedMinLenDKG is the minumum seed length required to participate in a DKG protocol - SeedMinLenDKG = securityBits / 8 - SeedMaxLenDKG = maxRelicPrgSeed ) type DKGState interface { diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index f247b9bc491..64f2a11c383 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -263,12 +263,12 @@ func (s *feldmanVSSstate) ForceDisqualify(participant int) error { // where `n` is the input `degree` (higher degree monomial in non-zero). // `a_0` is also non-zero (for single dealer BLS-DKGs, this insures // protocol public key output is not identity). -// `seed` is used as the entropy source and must be at least `SeedMinLenDKG` +// `seed` is used as the entropy source and must be at least `KeyGenSeedMinLen` // random bytes with at least 128 bits entropy. 
func generateFrPolynomial(seed []byte, degree int) ([]scalar, error) { - if len(seed) < SeedMinLenDKG { + if len(seed) < KeyGenSeedMinLen { return nil, invalidInputsErrorf( - "seed should be at least %d bytes, got %d", SeedMinLenDKG, len(seed)) + "seed should be at least %d bytes, got %d", KeyGenSeedMinLen, len(seed)) } // build a PRG out of the seed diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index a35d259f4f2..fc8f730e779 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -293,9 +293,9 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { // start DKG in all participants // start listening on the channels - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) read, err := mrand.Read(seed) - require.Equal(t, read, SeedMinLenDKG) + require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sync.Add(n) @@ -771,7 +771,7 @@ func TestDKGTransitionErrors(t *testing.T) { threshold := 3 myIndex := 0 dealer := 1 - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) t.Run("feldman VSS", func(t *testing.T) { state, err := NewFeldmanVSS(n, threshold, myIndex, dummyTestDKGProcessor{}, dealer) From 768602c176312dad7d8a5b441f71d4fa81d0c0a2 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 00:58:14 -0600 Subject: [PATCH 041/200] update randG2 to map to G2 and update membership check in G2 tests --- crypto/bls12381_utils.c | 93 ++++++++++++++++++--------------- crypto/bls12381_utils.go | 34 ++++-------- crypto/bls12381_utils.h | 20 +++---- crypto/bls12381_utils_test.go | 67 +++++++++++++++--------- crypto/bls_thresholdsign_core.c | 4 +- 5 files changed, 112 insertions(+), 106 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9518320d051..9ab5fb58d91 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -53,16 +53,6 @@ ctx_t* relic_init_BLS12_381() { return core_get(); } -// seeds relic PRG -void seed_relic(byte* seed, int len) { - #if RAND == HASHD - // instantiate a new DRBG - ctx_t *ctx = core_get(); - ctx->seeded = 0; - #endif - rand_seed(seed, len); -} - // global variable of the pre-computed data prec_st bls_prec_st; prec_st* bls_prec = NULL; @@ -128,7 +118,7 @@ prec_st* init_precomputed_data_BLS12_381() { // ------------------- Fr utilities // Montgomery constant R related to the curve order r -const limb_t BLS12_381_rR[Fr_LIMBS] = { /* (1<<256)%r */ +const Fr BLS12_381_rR = { /* R mod r = (1<<256)%r */ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) }; @@ -346,7 +336,7 @@ void Fr_write_bytes(uint8_t *bin, const Fr* a) { // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is vec256 (also used as Fr) -static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) +static void Fr_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); @@ -375,14 +365,14 @@ static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. 
bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { - vec256_from_be_bytes(a, bin, len); + Fr_from_be_bytes(a, bin, len); return Fr_is_zero(a); } // ------------------- Fp utilities -// Montgomery constant R related to the prime p -const limb_t BLS12_381_pR[Fp_LIMBS] = { ONE_MONT_P }; /* (1<<384)%p */ +// Montgomery constants related to the prime p +const Fp BLS12_381_pR = { ONE_MONT_P }; /* R mod p = (1<<384)%p */ // sets `a` to 0 void Fp_set_zero(Fp* a){ @@ -1248,14 +1238,21 @@ int bowe_subgroup_check_G1(const ep_t p){ } #endif -// generates a random point in G1 and stores it in p -void ep_rand_G1(ep_t p) { +/* +// maps the bytes to a point in G1 +// this is a testing file only, should not be used in any protocol! +void map_bytes_to_G1(ep_t p, const uint8_t* bytes, int len) { + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); // multiplies G1 generator by a random scalar - ep_rand(p); + + } -// generates a random point in E1\G1 and stores it in p -void ep_rand_G1complement(ep_t p) { +// generates a point in E1\G1 and stores it in p +// this is a testing file only, should not be used in any protocol! +void map_bytes_to_G1complement(ep_t p, const uint8_t* bytes, int len) { // generate a random point in E1 p->coord = BASIC; fp_set_dig(p->z, 1); @@ -1273,32 +1270,46 @@ void ep_rand_G1complement(ep_t p) { assert(ep_on_curve(p)); // sanity check to make sure p is in E1 } +*/ -// generates a random point in G2 and stores it in p -void ep2_rand_G2(ep2_t p) { +// maps the bytes to a point in G2. +// `len` should be at least Fr_BYTES. +// this is a testing tool only, it should not be used in any protocol! +void map_bytes_to_G2(E2* p, const uint8_t* bytes, int len) { + assert(len > Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); // multiplies G2 generator by a random scalar - ep2_rand(p); -} - -// generates a random point in E2\G2 and stores it in p -void ep2_rand_G2complement(ep2_t p) { - // generate a random point in E2 - p->coord = BASIC; - fp_set_dig(p->z[0], 1); - fp_zero(p->z[1]); - do { - fp2_rand(p->x); // set x to a random field element - byte r; - rand_bytes(&r, 1); - fp2_zero(p->y); - fp_set_bit(p->y[0], 0, r&1); // set y randomly to 0 or 1 + G2_mult_gen(p, &log); +} + +// attempts to map `bytes` to a point in E2\G2 and stores it in p. +// `len` should be at least G2_SER_BYTES. It returns BLST_SUCCESS only if mapping +// succeeds. +// For now, function only works when E2 serialization is compressed. +// this is a testing tool only, it should not be used in any protocol! +BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { + assert(G2_SERIALIZATION == COMPRESSED); + assert(len >= G2_SER_BYTES); + + // attempt to deserilize a compressed E2 point from input bytes + // after fixing the header 2 bits + byte copy[G2_SER_BYTES]; + memcpy(copy, bytes, sizeof(copy)); + copy[0] |= 1<<7; // set compression bit + copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity + + BLST_ERROR ser = E2_read_bytes(p, copy, len); + if (ser != BLST_SUCCESS) { + return ser; } - while (ep2_upk(p, p) == 0); // make sure p is in E1 - // map the point to E1\G1 by clearing G1 order - ep2_mul_basic(p, p, &core_get()->ep_r); + // map the point to E2\G2 by clearing G2 order + E2_mult(p, p, (const Fr*)BLS12_381_r); - assert(ep2_on_curve(p)); // sanity check to make sure p is in E1 + assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 + return BLST_SUCCESS; } // This is a testing function. 
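The order-clearing trick in map_bytes_to_G2complement has a simple analogue in any group of order r*h: exponentiating by r projects onto the cofactor-h subgroup, hence outside the order-r subgroup unless the result is the identity. A toy Go illustration in Z_23^* with r = 11 and h = 2 (not the library's code):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// Toy analogue of clearing the G2 order: in Z_23^* (group order 22 = 11 * 2),
	// raising an element to the subgroup order 11 projects it onto the cofactor
	// subgroup {1, 22}, i.e. outside the order-11 subgroup unless it lands on 1.
	p, q := big.NewInt(23), big.NewInt(11)
	x := big.NewInt(5)                 // arbitrary group element
	y := new(big.Int).Exp(x, q, p)     // "multiply by the order": y = x^11 mod 23
	check := new(big.Int).Exp(y, q, p) // y is in the order-11 subgroup iff y^11 == 1
	fmt.Println(y, check.Cmp(big.NewInt(1)) == 0) // 22 false
}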
diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0756f09472e..b6f822a6b1a 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -62,23 +62,6 @@ func (ct *ctx) initContext() error { return nil } -// seeds the internal relic random function. -// relic context must be initialized before seeding. -func seedRelic(seed []byte) error { - if len(seed) < (securityBits / 8) { - return invalidInputsErrorf( - "seed length needs to be larger than %d", - securityBits/8) - } - if len(seed) > maxRelicPrgSeed { - return invalidInputsErrorf( - "seed length needs to be less than %x", - maxRelicPrgSeed) - } - C.seed_relic((*C.uchar)(&seed[0]), (C.int)(len(seed))) - return nil -} - // Exponentiation in G1 (scalar point multiplication) func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) @@ -250,6 +233,7 @@ func checkMembershipG2(pt *pointE2) int { return int(C.G2_check_membership((*C.E2)(pt))) } +/* // randPointG1 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G1 and stores it in input point. func randPointG1(pt *pointE1) { @@ -261,20 +245,20 @@ func randPointG1(pt *pointE1) { func randPointG1Complement(pt *pointE1) { C.ep_rand_G1complement((*C.ep_st)(pt)) } +*/ -/* -// randPointG2 wraps a call to C since cgo can't be used in go test files. +// mapToG2 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G2 and stores it in input point. -func randPointG2(pt *pointE2) { - C.ep2_rand_G2((*C.E2)(pt)) +func mapToG2(pt *pointE2, src []byte) { + C.map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) } -// randPointG1Complement wraps a call to C since cgo can't be used in go test files. +// mapToG2Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func randPointG2Complement(pt *pointE2) { - C.ep2_rand_G2complement((*C.E2)(pt)) +func mapToG2Complement(pt *pointE2, src []byte) bool { + res := C.map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) + return int(res) == blst_valid } -*/ // This is only a TEST function. // It hashes `data` to a G1 point using the tag `dst` and returns the G1 point serialization. 
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 3e4c84ed43f..831bce5c62f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -96,7 +96,7 @@ int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); void map_to_G1(ep_t, const byte*, const int); // Fr utilities -extern const limb_t BLS12_381_rR[Fr_LIMBS]; +extern const Fr BLS12_381_rR; bool_t Fr_is_zero(const Fr* a); bool_t Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); @@ -130,8 +130,8 @@ void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); int G1_check_membership(const ep_t); int G1_simple_subgroup_check(const ep_t); -void ep_rand_G1(ep_t); -void ep_rand_G1complement( ep_t); +void map_bytes_to_G1(E1*, const uint8_t*, int); +void map_bytes_to_G1complement(E1*, const uint8_t*, int); #if (MEMBERSHIP_CHECK_G1 == BOWE) int bowe_subgroup_check_G1(const ep_t); #endif @@ -150,20 +150,16 @@ void E2_mult(E2*, const E2*, const Fr*); void E2_mult_small_expo(E2*, const E2*, const byte); void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); - -void ep2_mult(ep2_t res, const ep2_t p, const Fr* expo); - -void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -int G2_check_membership(const E2*); -int G2_simple_subgroup_check(const ep2_t); -void ep2_rand_G2(ep2_t); -void ep2_rand_G2complement( ep2_t); +void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); +int G2_check_membership(const E2*); +int G2_simple_subgroup_check(const ep2_t); +void map_bytes_to_G2(E2*, const uint8_t*, int); +BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); // Utility functions ctx_t* relic_init_BLS12_381(); prec_st* init_precomputed_data_BLS12_381(); void precomputed_data_set(const prec_st* p); -void seed_relic(byte*, int); // utility testing function void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ed72a5ec84b..7389dfa1454 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -6,7 +6,9 @@ package crypto import ( "crypto/rand" "encoding/hex" + mrand "math/rand" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -102,10 +104,9 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { - // seed Relic PRG - seed := make([]byte, securityBits/8) - _, _ = rand.Read(seed) - _ = seedRelic(seed) + r := time.Now().UnixNano() + mrand.Seed(r) + t.Logf("math rand seed is %d", r) /*t.Run("G1", func(t *testing.T) { var p pointE1 @@ -115,24 +116,34 @@ func TestSubgroupCheck(t *testing.T) { randPointG1Complement(&p) // point in E1\G1 res = checkMembershipG1(&p) assert.Equal(t, res, int(invalid)) + })*/ + + t.Run("G2", func(t *testing.T) { + t.Skip() // TODO: fix membership check in G2 and update + var p pointE2 + seed := make([]byte, PubKeyLenBLSBLS12381) + _, err := mrand.Read(seed) + require.NoError(t, err) + mapToG2(&p, seed) // point in G2 + res := checkMembershipG2(&p) + assert.Equal(t, res, int(valid)) + + inG2 := false + for !inG2 { + _, err := mrand.Read(seed) + require.NoError(t, err) + inG2 = mapToG2Complement(&p, seed) // point in E2\G2 + } + res = checkMembershipG2(&p) + assert.Equal(t, res, int(invalid)) }) - t.Run("G2", func(t *testing.T) { - var p pointE2 - randPointG2(&p) // point in G2 - res := checkMembershipG2(&p) - 
assert.Equal(t, res, int(valid)) - randPointG2Complement(&p) // point in E2\G2 - res = checkMembershipG2(&p) - assert.Equal(t, res, int(invalid)) - }) - */ } // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { - b.Run("G1", func(b *testing.B) { + /*b.Run("G1", func(b *testing.B) { var p pointE1 randPointG1(&p) b.ResetTimer() @@ -140,16 +151,20 @@ func BenchmarkSubgroupCheck(b *testing.B) { _ = checkMembershipG1(&p) // G1 } b.StopTimer() + })*/ + + b.Run("G2", func(b *testing.B) { + var p pointE2 + seed := make([]byte, PubKeyLenBLSBLS12381) + _, err := mrand.Read(seed) + require.NoError(b, err) + mapToG2(&p, seed) // point in G2 + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = checkMembershipG2(&p) // G2 + } + b.StopTimer() }) - /* - b.Run("G2", func(b *testing.B) { - var p pointE2 - randPointG2(&p) - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = checkMembershipG2(&p) // G2 - } - b.StopTimer() - }) - */ + } diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 96d07f2a42e..777af1ef5e9 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -16,8 +16,8 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indice Fr denominator; // eventually would represent D*R^k // Initialize N and D to Montgomery constant R - Fr_copy(&numerator, (Fr*)BLS12_381_rR); - Fr_copy(&denominator, (Fr*)BLS12_381_rR); + Fr_copy(&numerator, &BLS12_381_rR); + Fr_copy(&denominator, &BLS12_381_rR); // sign of D: 0 for positive and 1 for negative int sign = 0; From 21676845b5f728c4e2ebc9d1bb052d4ecbf7b8f2 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 11:48:22 -0600 Subject: [PATCH 042/200] membership check in G2 using BLST --- crypto/bls.go | 4 ++-- crypto/bls12381_utils.c | 27 +++++++++---------------- crypto/bls12381_utils.go | 9 +++++---- crypto/bls12381_utils.h | 5 ++--- crypto/bls12381_utils_test.go | 8 ++------ crypto/bls_core.c | 38 ++++++++--------------------------- crypto/bls_include.h | 3 --- crypto/blst_src/blst_src.c | 3 +-- 8 files changed, 29 insertions(+), 68 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 0cec3458bbf..5cc78190d8a 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -293,7 +293,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { } defer overwrite(okm) // overwrite okm - // map the bytes to a private key using modular reduction + // map the bytes to a private key using modular reduction // SK = OS2IP(OKM) mod r isZero := mapToFr(&sk.scalar, okm) if !isZero { @@ -353,7 +353,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.G2_check_membership((*C.E2)(&pk.point)) != valid { + if C.E2_in_G2((*C.E2)(&pk.point)) == (C.ulonglong)(0) { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9ab5fb58d91..855d61ad7ac 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1013,6 +1013,13 @@ void G2_mult_gen(E2* res, const Fr* expo) { POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); } +// checks if input E2 point is on the subgroup G2. +// It assumes input `p` is on E2. 
+bool_t E2_in_G2(const E2* p){ + // currently uses Scott method + return POINTonE2_in_G2((const POINTonE2*)p); +} + // computes the sum of the G2 array elements y and writes the sum in jointy void E2_sum_vector(E2* jointy, const E2* y, const int len){ E2_set_infty(jointy); @@ -1040,7 +1047,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s1 is in G1 - if (G1_check_membership(elemsG1[0]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (E1_in_G1(elemsG1[0]) != VALID) return INVALID; // elemsG1[1] = s2 @@ -1050,7 +1057,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* return read_ret; // check s2 in G1 - if (G1_check_membership(elemsG1[1]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 + if (E1_in_G1(elemsG1[1]) != VALID) return INVALID; // elemsG2[1] = pk1 @@ -1166,22 +1173,6 @@ int G1_simple_subgroup_check(const ep_t p){ return VALID; } -// uses a simple scalar multiplication by G1's order -// to check whether a point on the curve E2 is in G2. -int G2_simple_subgroup_check(const ep2_t p){ - ep2_t inf; - ep2_new(inf); - // check p^order == infinity - // use basic double & add as lwnaf reduces the expo modulo r - ep2_mul_basic(inf, (ep2_st*)p, &core_get()->ep_r); - if (!ep2_is_infty(inf)){ - ep2_free(inf); - return INVALID; - } - ep2_free(inf); - return VALID; -} - #if (MEMBERSHIP_CHECK_G1 == BOWE) // beta such that beta^3 == 1 mod p // beta is in the Montgomery form diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index b6f822a6b1a..033bcfcb20f 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -223,14 +223,15 @@ func readPointE1(a *pointE1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. -func checkMembershipG1(pt *pointE1) int { - return int(C.G1_check_membership((*C.ep_st)(pt))) +func checkMembershipG1(pt *pointE1) bool { + //return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) + return true } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. 
-func checkMembershipG2(pt *pointE2) int { - return int(C.G2_check_membership((*C.E2)(pt))) +func checkMembershipG2(pt *pointE2) bool { + return C.E2_in_G2((*C.E2)(pt)) != (C.ulonglong)(0) } /* diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 831bce5c62f..d29dcf54c63 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -128,7 +128,7 @@ void ep_mult_generic_bench(ep_t, const Fr*); void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -int G1_check_membership(const ep_t); +int E1_in_G1(const ep_t); int G1_simple_subgroup_check(const ep_t); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); @@ -151,8 +151,7 @@ void E2_mult_small_expo(E2*, const E2*, const byte); void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -int G2_check_membership(const E2*); -int G2_simple_subgroup_check(const ep2_t); +bool_t E2_in_G2(const E2*); void map_bytes_to_G2(E2*, const uint8_t*, int); BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 7389dfa1454..78d08810a6f 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -119,14 +119,12 @@ func TestSubgroupCheck(t *testing.T) { })*/ t.Run("G2", func(t *testing.T) { - t.Skip() // TODO: fix membership check in G2 and update var p pointE2 seed := make([]byte, PubKeyLenBLSBLS12381) _, err := mrand.Read(seed) require.NoError(t, err) mapToG2(&p, seed) // point in G2 - res := checkMembershipG2(&p) - assert.Equal(t, res, int(valid)) + assert.True(t, checkMembershipG2(&p)) inG2 := false for !inG2 { @@ -134,10 +132,8 @@ func TestSubgroupCheck(t *testing.T) { require.NoError(t, err) inG2 = mapToG2Complement(&p, seed) // point in E2\G2 } - res = checkMembershipG2(&p) - assert.Equal(t, res, int(invalid)) + assert.False(t, checkMembershipG2(&p)) }) - } // subgroup membership check bench diff --git a/crypto/bls_core.c b/crypto/bls_core.c index eae1382e6a1..47e7d270546 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,8 +21,9 @@ int get_sk_len() { // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. -int G1_check_membership(const ep_t p){ -#if MEMBERSHIP_CHECK +int E1_in_G1(const ep_t p){ +// TODO: to upadte +/* #if MEMBERSHIP_CHECK_G1 == EXP_ORDER return G1_simple_subgroup_check(p); #elif MEMBERSHIP_CHECK_G1 == BOWE @@ -31,30 +32,7 @@ int G1_check_membership(const ep_t p){ #else return UNDEFINED; #endif -#endif - return VALID; -} - -// checks if input point s is on the curve E2 -// and is in the subgroup G2. -// -// membership check in G2 is using a scalar multiplication by the group order. -// TODO: switch to the faster Bowe check -int G2_check_membership(const E2* p){ -#if MEMBERSHIP_CHECK - // check p is on curve - if (!E2_affine_on_curve(p)) // TODO: remove and assume inputs are on curve? 
- return INVALID; - // check p is in G2 - #if MEMBERSHIP_CHECK_G2 == EXP_ORDER - return G2_simple_subgroup_check(p); - #elif MEMBERSHIP_CHECK_G2 == BOWE - // TODO: implement Bowe's check - return UNDEFINED; - #else - return UNDEFINED; - #endif -#endif +*/ return VALID; } @@ -172,7 +150,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (ret != RLC_OK) goto out; // check s is in G1 - ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = E1_in_G1(elemsG1[0]); if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -260,7 +238,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (ret != RLC_OK) goto out; // check s in G1 - ret = G1_check_membership(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 + ret = E1_in_G1(elemsG1[0]); if (ret != VALID) goto out; // elemsG2[0] = -g2 @@ -346,7 +324,7 @@ int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { } // check s is in G1 - if (G1_check_membership(s) != VALID) { // only enabled if MEMBERSHIP_CHECK==1 + if (E1_in_G1(s) != VALID) { return INVALID; } @@ -495,7 +473,7 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK || G1_check_membership(&sigs[i]) != VALID) { + if (read_ret != RLC_OK || E1_in_G1(&sigs[i]) != VALID) { if (read_ret == UNDEFINED) {// unexpected error case goto out; }; diff --git a/crypto/bls_include.h b/crypto/bls_include.h index f81f2839bcf..f5a6a53a6f7 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -20,9 +20,6 @@ #define DOUBLE_PAIRING 1 #define SINGLE_PAIRING (DOUBLE_PAIRING^1) -// Signature and public key membership check -#define MEMBERSHIP_CHECK 0 // TODO: switch to 1 and clean up memb check - // algorithm choice for hashing to G1 // both methods are similar implementations of the same optimized SSWU // but offer different timings. 
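Both the removed G2_simple_subgroup_check and the EXP_ORDER branch of E1_in_G1 above rely on the classic membership test: multiply the candidate point by the prime group order and compare the result to the point at infinity. A minimal standalone sketch of that idea, transposed to the multiplicative group Z_p^* with made-up toy constants (not the curve code, and not this package's API), where raising to the subgroup order plays the role of the scalar multiplication:

package main

import (
	"fmt"
	"math/big"
)

// inPrimeOrderSubgroup reports whether x lies in the subgroup of prime order r
// of Z_p^*: x is a member exactly when x^r == 1, the analogue of
// "p * order == infinity" on the curve.
func inPrimeOrderSubgroup(x, r, p *big.Int) bool {
	one := big.NewInt(1)
	return new(big.Int).Exp(x, r, p).Cmp(one) == 0
}

func main() {
	p := big.NewInt(23) // p-1 = 22 = 2*11, so Z_23^* has a subgroup of prime order 11
	r := big.NewInt(11)

	fmt.Println(inPrimeOrderSubgroup(big.NewInt(2), r, p)) // true:  2 has order 11
	fmt.Println(inPrimeOrderSubgroup(big.NewInt(5), r, p)) // false: 5^11 == -1 mod 23
}

The exponentiation-by-order test is correct but costs a full scalar multiplication, which is why the patches above move G1 and G2 membership to the faster checks implemented by blst (the Scott method mentioned in E2_in_G2).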
diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index dc2d2c40a4e..4b0732e06e4 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,7 +1,6 @@ // +build relic -// keygen.c is not included as it is imported by dkg_core and is not needed -// by bls12_381_utils +#include "keygen.c" #include "hash_to_field.c" #include "e1.c" #include "map_to_g1.c" From 2d90a7089757864dd3ce340ea6e488620b23821d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 18:08:32 -0600 Subject: [PATCH 043/200] update batch verify random coefficients --- crypto/bls12381_utils.c | 2 -- crypto/bls_core.c | 40 ++++++++++++++++++++++++---------------- crypto/bls_include.h | 4 ++-- crypto/bls_multisig.go | 41 ++++++++++++++++++++++++----------------- crypto/bls_test.go | 7 +++++-- 5 files changed, 55 insertions(+), 39 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 855d61ad7ac..55569075f14 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -113,8 +113,6 @@ prec_st* init_precomputed_data_BLS12_381() { return bls_prec; } -// ------------------- Utilities - // ------------------- Fr utilities // Montgomery constant R related to the curve order r diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 47e7d270546..528aaef7244 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -357,7 +357,7 @@ static void free_tree(node* root) { if (!root) return; // only free pks and sigs of non-leafs, data of leafs are allocated - // as an entire array in `bls_batchVerify`. + // as an entire array in `bls_batch_verify`. if (root->left) { // no need to check the right child for the leaf check because // the recursive build starts with the left side first // relic free @@ -413,7 +413,7 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { } // verify the binary tree and fill the results using recursive batch verifications. -static void bls_batchVerify_tree(const node* root, const int len, byte* results, +static void bls_batch_verify_tree(const node* root, const int len, byte* results, const byte* data, const int data_len) { // verify the aggregated signature against the aggregated public key. int res = bls_verify_ep(root->pk, root->sig, data, data_len); @@ -436,21 +436,23 @@ static void bls_batchVerify_tree(const node* root, const int len, byte* results, // use the binary tree structure to find the invalid signatures. int right_len = len/2; int left_len = len - right_len; - bls_batchVerify_tree(root->left, left_len, &results[0], data, data_len); - bls_batchVerify_tree(root->right, right_len, &results[left_len], data, data_len); + bls_batch_verify_tree(root->left, left_len, &results[0], data, data_len); + bls_batch_verify_tree(root->right, right_len, &results[left_len], data, data_len); } // Batch verifies the validity of a multiple BLS signatures of the // same message under multiple public keys. Each signature at index `i` is verified // against the public key at index `i`. +// `seed` is used as the entropy source for randoms required by the computation. The function +// assumes the source size is at least (16*sigs_len) of random bytes of entropy at least 128 bits. // // - membership checks of all signatures is verified upfront. // - use random coefficients for signatures and public keys at the same index to prevent // indices mixup. // - optimize the verification by verifying an aggregated signature against an aggregated // public key, and use a recursive verification to find invalid signatures. 
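The batch verification being renamed to bls_batch_verify in the hunks below combines two ideas spelled out in its comment: a random coefficient applied to the signature and the public key at the same index prevents index mixups, and a recursive split over an aggregation tree localizes invalid signatures with only a logarithmic number of aggregated pairing checks when most signatures are valid. A minimal sketch of that recursion, with the pairing check abstracted into a stand-in predicate (all names here are illustrative, not the package API; real code aggregates curve points, but the control flow is the same):

package main

import "fmt"

// verifyAggregate stands in for "verify the aggregated signature against the
// aggregated public key"; here it simply reports whether every signature in
// the range is good.
func verifyAggregate(good []bool) bool {
	for _, ok := range good {
		if !ok {
			return false
		}
	}
	return true
}

// batchVerifyTree fills results[i] with the validity of signature i, using a
// single aggregated check per subtree whose signatures are all valid and
// recursing into both halves otherwise.
func batchVerifyTree(good []bool, results []bool) {
	if verifyAggregate(good) {
		for i := range results {
			results[i] = true
		}
		return
	}
	if len(good) == 1 { // leaf: this single signature is invalid
		results[0] = false
		return
	}
	left := len(good) - len(good)/2 // mirror the left_len / right_len split
	batchVerifyTree(good[:left], results[:left])
	batchVerifyTree(good[left:], results[left:])
}

func main() {
	good := []bool{true, true, false, true, true, true, true, true}
	results := make([]bool, len(good))
	batchVerifyTree(good, results)
	fmt.Println(results) // [true true false true true true true true]
}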
-void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, - const byte* sigs_bytes, const byte* data, const int data_len) { +void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, + const byte* sigs_bytes, const byte* data, const int data_len, const byte* seed) { // initialize results to undefined memset(results, UNDEFINED, sigs_len); @@ -464,7 +466,6 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, ep_new(sigs[i]); ep2_new(pks[i]); } - bn_t r; bn_new(r); for (int i=0; i < sigs_len; i++) { // convert the signature points: @@ -484,14 +485,21 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, results[i] = INVALID; } else { // choose a random non-zero coefficient of at least 128 bits - // TODO: find a way to generate randoms - bn_rand(r, RLC_POS, SEC_BITS); - bn_add_dig(r, r, 1); - Fr* tmp = Fr_relic_to_blst(r); - // multiply public key and signature by the same random exponent - E2_mult(&pks[i], &pks_input[i], tmp); - free(tmp); - ep_mul_lwnaf(&sigs[i], &sigs[i], r); + Fr r, one; + // r = random, i-th seed is used for i-th signature + Fr_set_zero(&r); + const int seed_len = SEC_BITS/8; + limbs_from_be_bytes((limb_t*)&r, seed + (seed_len*i), seed_len); // faster shortcut than Fr_map_bytes + // r = random + 1 + Fr_set_limb(&one, 1); + Fr_add(&r, &r, &one); + /*char str[20]; sprintf(str, "r-%d", i); + Fr_print_(str, &r);*/ + // multiply public key and signature by the same random exponent r + E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? + bn_st* tmp = Fr_blst_to_relic(&r); + ep_mul_lwnaf(&sigs[i], &sigs[i], tmp); + free(tmp); } } // build a binary tree of aggreagtions @@ -499,7 +507,7 @@ void bls_batchVerify(const int sigs_len, byte* results, const E2* pks_input, if (!root) goto out; // verify the binary tree and fill the results using batch verification - bls_batchVerify_tree(root, sigs_len, &results[0], data, data_len); + bls_batch_verify_tree(root, sigs_len, &results[0], data, data_len); // free the allocated tree free_tree(root); diff --git a/crypto/bls_include.h b/crypto/bls_include.h index f5a6a53a6f7..d0f9120beb2 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -39,7 +39,7 @@ int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const int bls_verifyPerDistinctKey(const byte*, const int, const E2*, const uint32_t*, const byte*, const uint32_t*); -void bls_batchVerify(const int, byte*, const E2*, - const byte*, const byte*, const int); +void bls_batch_verify(const int, byte*, const E2*, + const byte*, const byte*, const int, const byte*); #endif diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index e6589a60031..ffaf8d637ce 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -4,6 +4,7 @@ package crypto import ( + "crypto/rand" "errors" "fmt" @@ -472,27 +473,27 @@ func VerifyBLSSignatureManyMessages( func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { + // boolean array returned when errors occur + falseSlice := make([]bool, len(sigs)) + // empty list check if len(pks) == 0 { - return []bool{}, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) + return falseSlice, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError) } if len(pks) != len(sigs) { - return []bool{}, invalidInputsErrorf( + return falseSlice, invalidInputsErrorf( "keys length %d and signatures length %d are 
mismatching", len(pks), len(sigs)) } - // return boolean array returnBool := make([]bool, len(sigs)) - // temporary boolean array to hold the return values till all the return values are set - tmpBool := make([]bool, len(sigs)) - for i := range tmpBool { - tmpBool[i] = true // default to true + for i := range returnBool { + returnBool[i] = true // default to true } if err := checkBLSHasher(kmac); err != nil { - return returnBool, err + return falseSlice, err } // flatten the shares (required by the C layer) @@ -507,14 +508,14 @@ func BatchVerifyBLSSignaturesOneMessage( for i, pk := range pks { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { - return returnBool, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) + return falseSlice, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) } if len(sigs[i]) != signatureLengthBLSBLS12381 || pkBLS.isIdentity { // case of invalid signature: set the signature and public key at index `i` // to identities so that there is no effect on the aggregation tree computation. // However, the boolean return for index `i` is set to `false` and won't be overwritten. - tmpBool[i] = false + returnBool[i] = false pkPoints = append(pkPoints, getIdentityPoint()) flatSigs = append(flatSigs, identityBLSSignature...) } else { @@ -525,28 +526,34 @@ func BatchVerifyBLSSignaturesOneMessage( // hash the input to 128 bytes h := kmac.ComputeHash(message) - verifInt := make([]byte, len(returnBool)) + verifInt := make([]byte, len(sigs)) + // internal non-determministic entropy source required by bls_batch_verify + // specific length of the seed is required by bls_batch_verify. + seed := make([]byte, (securityBits/8)*len(verifInt)) + _, err := rand.Read(seed) + if err != nil { + return falseSlice, fmt.Errorf("generating randoms failed: %w", err) + } - C.bls_batchVerify( + C.bls_batch_verify( (C.int)(len(verifInt)), (*C.uchar)(&verifInt[0]), (*C.E2)(&pkPoints[0]), (*C.uchar)(&flatSigs[0]), (*C.uchar)(&h[0]), (C.int)(len(h)), + (*C.uchar)(&seed[0]), ) for i, v := range verifInt { if (C.int)(v) != valid && (C.int)(v) != invalid { - return returnBool, fmt.Errorf("batch verification failed") + return falseSlice, fmt.Errorf("batch verification failed") } - if tmpBool[i] { // only overwrite if not previously written - tmpBool[i] = ((C.int)(v) == valid) + if returnBool[i] { // only overwrite if not previously set to false + returnBool[i] = ((C.int)(v) == valid) } } - // make sure returnBool is []false till this point - copy(returnBool, tmpBool) return returnBool, nil } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 703ec9784b8..d6c849f2feb 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -725,16 +725,19 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:0], sigs[:0], input, kmac) require.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.Equal(t, valid, []bool{}, + assert.Equal(t, valid, expectedValid[:0], "verification should fail with empty list key, got %v", valid) }) // test incorrect inputs t.Run("inconsistent inputs", func(t *testing.T) { + for i := 0; i < sigsNum; i++ { + expectedValid[i] = false + } valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:len(pks)-1], sigs, input, kmac) require.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.Equal(t, valid, []bool{}, + assert.Equal(t, valid, expectedValid, "verification should fail with incorrect input lenghts, got %v", valid) }) From 183627b770db6e5ff8afc8fe2fd56cab9800de35 Mon Sep 17 00:00:00 2001 
From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 18:45:02 -0600 Subject: [PATCH 044/200] minor cleanup --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 033bcfcb20f..0ca3e8d48a2 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -112,7 +112,7 @@ func (p *pointE2) isInfinity() bool { func randFr(x *scalar, rand random.Rand) bool { // use extra 128 bits to reduce the modular reduction bias bytes := make([]byte, frBytesLen+securityBits/8) - rand.Read(bytes) // checking one output is enough + rand.Read(bytes) // modular reduction return mapToFr(x, bytes) } From c033600bad1eba99bd261e8218a4306426ce275d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 20 Apr 2023 21:07:31 -0600 Subject: [PATCH 045/200] fix linter --- crypto/bls12381_utils.go | 3 ++- crypto/common.go | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0ca3e8d48a2..7c46b8a20a3 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -121,7 +121,8 @@ func randFr(x *scalar, rand random.Rand) bool { // and saves the random in `x`. func randFrStar(x *scalar, rand random.Rand) { isZero := true - // exteremely unlikely this loop runs more than once + // exteremely unlikely this loop runs more than once, + // but force the output to be non-zero instead of propagating an error. for isZero { isZero = randFr(x, rand) } diff --git a/crypto/common.go b/crypto/common.go index f476de92e3f..7e460cbf6d2 100644 --- a/crypto/common.go +++ b/crypto/common.go @@ -21,9 +21,6 @@ const ( // it is still recommened that seed is generated using a secure RNG. KeyGenSeedMinLen = 2 * (securityBits / 8) KeyGenSeedMaxLen = 256 - - // max relic PRG seed length in bytes - maxRelicPrgSeed = 1 << 32 ) // TODO: update this code to make sure From 0a2e943e1a7ae10f66627ee7529238347cd38d53 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 24 Apr 2023 17:38:27 -0600 Subject: [PATCH 046/200] add replaces to test using latest crypto package --- go.mod | 2 ++ insecure/go.mod | 2 ++ integration/go.mod | 2 ++ 3 files changed, 6 insertions(+) diff --git a/go.mod b/go.mod index 21a9faa6018..d808194e99f 100644 --- a/go.mod +++ b/go.mod @@ -278,3 +278,5 @@ require ( lukechampine.com/blake3 v1.1.7 // indirect nhooyr.io/websocket v1.8.6 // indirect ) + +replace github.com/onflow/flow-go/crypto => ./crypto diff --git a/insecure/go.mod b/insecure/go.mod index 1c74525425e..a76a0fe92db 100644 --- a/insecure/go.mod +++ b/insecure/go.mod @@ -269,3 +269,5 @@ require ( ) replace github.com/onflow/flow-go => ../ + +replace github.com/onflow/flow-go/crypto => ../crypto diff --git a/integration/go.mod b/integration/go.mod index b1ae92ab43b..0261ce32dd4 100644 --- a/integration/go.mod +++ b/integration/go.mod @@ -325,3 +325,5 @@ require ( replace github.com/onflow/flow-go => ../ replace github.com/onflow/flow-go/insecure => ../insecure + +replace github.com/onflow/flow-go/crypto => ../crypto From 7bd182aacfdec08219649482e7bfba2e028845e7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 24 Apr 2023 18:08:18 -0600 Subject: [PATCH 047/200] temp update to makefile to setup crypto with replace statement --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b465aad4e31..8719bc21dce 100644 --- a/Makefile +++ b/Makefile @@ -43,9 +43,12 @@ export CONTAINER_REGISTRY := gcr.io/flow-container-registry export 
DOCKER_BUILDKIT := 1 # setup the crypto package under the GOPATH: needed to test packages importing flow-go/crypto +# TODO: replace by bash crypto_setup.sh after removing replace statements .PHONY: crypto_setup_gopath crypto_setup_gopath: - bash crypto_setup.sh + (cd ./crypto && make setup) + + cmd/collection/collection: go build -o cmd/collection/collection cmd/collection/main.go From 9ae8df2473230b6c389bc1645bb3e668771fcf3e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 24 Apr 2023 18:19:52 -0600 Subject: [PATCH 048/200] mod tidy --- go.sum | 3 --- insecure/go.mod | 1 + insecure/go.sum | 19 ++++++++++++++++--- integration/go.mod | 1 + integration/go.sum | 4 +--- 5 files changed, 19 insertions(+), 9 deletions(-) diff --git a/go.sum b/go.sum index 79d22d8b924..b5ddfc7ecfd 100644 --- a/go.sum +++ b/go.sum @@ -1236,8 +1236,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtx github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow-go/crypto v0.24.7 h1:RCLuB83At4z5wkAyUCF7MYEnPoIIOHghJaODuJyEoW0= -github.com/onflow/flow-go/crypto v0.24.7/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d h1:Wl8bE1YeZEcRNnCpxw2rikOEaivuYKDrnJd2vsfIWoA= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= @@ -1477,7 +1475,6 @@ github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= diff --git a/insecure/go.mod b/insecure/go.mod index a76a0fe92db..dae2503f3b6 100644 --- a/insecure/go.mod +++ b/insecure/go.mod @@ -257,6 +257,7 @@ require ( golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect golang.org/x/tools v0.6.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect + gonum.org/v1/gonum v0.8.2 // indirect google.golang.org/api v0.114.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 // indirect diff --git a/insecure/go.sum b/insecure/go.sum index 598f99e4cdb..d4214a1cbdd 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -85,6 +85,7 @@ github.com/VictoriaMetrics/fastcache v1.5.3/go.mod h1:+jv9Ckb+za/P1ZRg/sulP5Ni1v github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/4+TcAqDqk/vUH7g= github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= github.com/afex/hystrix-go 
v0.0.0-20180502004556-fa1af6a1f4f5/go.mod h1:SkGFH1ia65gfNATL8TAiHDNxPzPdmEL5uirI2Uyuz6c= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= @@ -304,6 +305,7 @@ github.com/flynn/go-shlex v0.0.0-20150515145356-3f9db97f8568/go.mod h1:xEzjJPgXI github.com/flynn/noise v0.0.0-20180327030543-2492fe189ae6/go.mod h1:1i71OnUq3iUe1ma7Lr6yG6/rjvM3emb6yoL7xLFzcVQ= github.com/flynn/noise v1.0.0 h1:DlTHqmzmvcEiKj+4RYo/imoswx/4r6iBlCMfVtrMXpQ= github.com/flynn/noise v1.0.0/go.mod h1:xbMo+0i6+IGbYdJhF31t2eR1BIU0CYc12+BNAKwUTag= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/francoispqt/gojay v1.2.13 h1:d2m3sFjloqoIUQU3TsHBgj6qg/BVGlTBeHDUmyJnXKk= github.com/francoispqt/gojay v1.2.13/go.mod h1:ehT5mTG4ua4581f1++1WLG0vPdaA9HaiDsoyrBGkyDY= github.com/franela/goblin v0.0.0-20200105215937-c9ffbefa60db/go.mod h1:7dvUGVsVBjqR7JHJk0brhHOZYGmfBYOrK0ZhYMEtBr4= @@ -391,6 +393,7 @@ github.com/gogo/protobuf v1.3.0/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXP github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v1.0.0 h1:nfP3RFugxnNRyKgeWd4oI1nYvXpxrx8ck8ZrcizshdQ= github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= @@ -723,6 +726,7 @@ github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfV github.com/julienschmidt/httprouter v1.1.1-0.20170430222011-975b5c4c7c21/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= github.com/kami-zh/go-capturer v0.0.0-20171211120116-e492ea43421d/go.mod h1:P2viExyCEfeWGU259JnaQ34Inuec4R38JCyBx2edgD0= github.com/karalabe/usb v0.0.0-20190919080040-51dc0efba356/go.mod h1:Od972xHfMJowv7NGVDiWVxk2zxnWgjLlJzE+F4F7AGU= @@ -1184,8 +1188,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtx github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow-go/crypto v0.24.7 h1:RCLuB83At4z5wkAyUCF7MYEnPoIIOHghJaODuJyEoW0= -github.com/onflow/flow-go/crypto v0.24.7/go.mod 
h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d h1:Wl8bE1YeZEcRNnCpxw2rikOEaivuYKDrnJd2vsfIWoA= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230330183547-d0dd18f6f20d/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= @@ -1423,7 +1425,6 @@ github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= @@ -1590,7 +1591,10 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.4.0 h1:UVQgzMY87xqpKNgb+kDsll2Igd33HszWHFLmpaRMq/8= golang.org/x/crypto v0.4.0/go.mod h1:3quD/ATkf6oY+rnes5c3ExXTbLc8mueNue5/DoinL80= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= @@ -1603,6 +1607,7 @@ golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMk golang.org/x/exp v0.0.0-20200331195152-e8c3332aa8e5/go.mod h1:4M0jN8W1tt0AVLNr8HDosyJCDCDuyL9N9+3m7wDWgKw= golang.org/x/exp v0.0.0-20221217163422-3c43f8badb15 h1:5oN1Pz/eDhCpbMbLstvIPa0b/BEQo6g6nwV3pLjfM6w= golang.org/x/exp v0.0.0-20221217163422-3c43f8badb15/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -1833,12 +1838,14 @@ golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac h1:7zkz7BUtwNFFqcowJ+RIgu2MaV/MapERkDIy+mwPyjs= golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180828015842-6cd1fcedba52/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030000716-a0a13e073c7b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181130052023-1c3d964395ce/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= @@ -1904,7 +1911,12 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM= +gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= google.golang.org/api v0.0.0-20180910000450-7ca32eb868bf/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= google.golang.org/api v0.0.0-20181030000543-1d582fd0359e/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0= google.golang.org/api v0.1.0/go.mod h1:UGEZY7KEX120AnNLIHFMKIo4obdJhkp2tPbaPlQx13Y= @@ -2090,6 +2102,7 @@ nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= diff --git a/integration/go.mod b/integration/go.mod index 0261ce32dd4..6487fe8f906 100644 --- a/integration/go.mod +++ b/integration/go.mod @@ -307,6 +307,7 @@ require ( golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac // indirect golang.org/x/tools v0.6.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // 
indirect + gonum.org/v1/gonum v0.11.0 // indirect google.golang.org/api v0.114.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230306155012-7f2fa6fef1f4 // indirect diff --git a/integration/go.sum b/integration/go.sum index 35c6fbd3bef..4870a501c95 100644 --- a/integration/go.sum +++ b/integration/go.sum @@ -1316,8 +1316,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtx github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow-go/crypto v0.24.7 h1:RCLuB83At4z5wkAyUCF7MYEnPoIIOHghJaODuJyEoW0= -github.com/onflow/flow-go/crypto v0.24.7/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230407005012-727d541fd5f8 h1:O8uM6GVVMhRwBtYaGl93+tDSu6vWqUc47b12fPkZGXk= github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230407005012-727d541fd5f8/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= @@ -1601,7 +1599,6 @@ github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= github.com/syndtr/gocapability v0.0.0-20170704070218-db04d3cc01c8/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20200815110645-5c35d600f0ca/go.mod h1:u2MKkTVTVJWe5D1rCvame8WqhBd88EuIwODJZ1VHCPM= @@ -2141,6 +2138,7 @@ gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJ gonum.org/v1/gonum v0.0.0-20181121035319-3f7ecaa7e8ca/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.6.0/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E= +gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= gonum.org/v1/netlib v0.0.0-20181029234149-ec6d1f5cefe6/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= From 2fb816c18eaecc6c684ec5016fa7237a1ae042a6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 25 Apr 2023 15:07:11 -0600 Subject: [PATCH 049/200] enable membership check in G2 to fix FVM test --- crypto/bls_core.c | 4 +++- crypto/bls_include.h | 2 +- crypto/dkg_core.c | 2 +- fvm/crypto/crypto_test.go | 15 +++++++-------- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 097e1595d44..e29d3401d69 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -47,7 +47,9 @@ int G2_check_membership(const E2* p){ return INVALID; // check p is in G2 #if MEMBERSHIP_CHECK_G2 == EXP_ORDER - return G2_simple_subgroup_check(p); + // 
TODO: clean up + ep2_st* tmp = E2_blst_to_relic(p); + return G2_simple_subgroup_check(tmp); #elif MEMBERSHIP_CHECK_G2 == BOWE // TODO: implement Bowe's check return UNDEFINED; diff --git a/crypto/bls_include.h b/crypto/bls_include.h index f81f2839bcf..7a2572a2fc4 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -21,7 +21,7 @@ #define SINGLE_PAIRING (DOUBLE_PAIRING^1) // Signature and public key membership check -#define MEMBERSHIP_CHECK 0 // TODO: switch to 1 and clean up memb check +#define MEMBERSHIP_CHECK 1 // algorithm choice for hashing to G1 // both methods are similar implementations of the same optimized SSWU diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index aedf5d83164..d5f39976090 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -79,7 +79,7 @@ BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ return read_ret; p += G2_SER_BYTES; } - // TODO: add G2 subgroup check + // TODO: add G2 subgroup check? return BLST_SUCCESS; } diff --git a/fvm/crypto/crypto_test.go b/fvm/crypto/crypto_test.go index fe6c400c1b4..ffbdec3a730 100644 --- a/fvm/crypto/crypto_test.go +++ b/fvm/crypto/crypto_test.go @@ -425,16 +425,13 @@ func TestVerifySignatureFromTransaction(t *testing.T) { func TestValidatePublicKey(t *testing.T) { - // make sure the seed length is larger than miniumum seed lengths of all signature schemes - seedLength := 64 - validPublicKey := func(t *testing.T, s runtime.SignatureAlgorithm) []byte { - seed := make([]byte, seedLength) + seed := make([]byte, gocrypto.KeyGenSeedMinLen) _, err := rand.Read(seed) require.NoError(t, err) - pk, err := gocrypto.GeneratePrivateKey(crypto.RuntimeToCryptoSigningAlgorithm(s), seed) + sk, err := gocrypto.GeneratePrivateKey(crypto.RuntimeToCryptoSigningAlgorithm(s), seed) require.NoError(t, err) - return pk.PublicKey().Encode() + return sk.PublicKey().Encode() } t.Run("Unknown algorithm should return false", func(t *testing.T) { @@ -463,12 +460,14 @@ func TestValidatePublicKey(t *testing.T) { runtime.SignatureAlgorithmBLS_BLS12_381, } for i, s := range signatureAlgos { + t.Run(fmt.Sprintf("case %v: %v", i, s), func(t *testing.T) { key := validPublicKey(t, s) + // This may cause flakiness depending on the public key + // deserialization scheme used!! key[0] ^= 1 // alter one bit of the valid key - err := crypto.ValidatePublicKey(s, key) - require.Error(t, err) + require.Errorf(t, err, "key is %#x", key) }) } }) From 54b92e63c3f2bde9407a6b0fb936ec0697b33dbd Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 25 Apr 2023 19:02:10 -0600 Subject: [PATCH 050/200] fix E2 infinity set and check to be based on projective Z --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 6 ++++-- crypto/bls12381_utils.go | 4 +--- crypto/bls_core.c | 4 ++-- crypto/bls_test.go | 41 ++++++++++++++++++++++++++++++++++++---- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index d45ea7f3aeb..a2d372aca25 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -227,7 +227,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) } // 0xC0 is the header of the point at infinity serialization (either in G1 or G2) -const infinityPointHeader = 0xC0 +const infinityPointHeader = byte(0xC0) var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, signatureLengthBLSBLS12381-1)...) 
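The 0xC0 header used for identityBLSSignature above follows the compressed Zcash-style serialization: the most significant bit (0x80) marks a compressed encoding and the next bit (0x40) marks the point at infinity, so the identity point encodes as 0xC0 followed by zero bytes. A small standalone sketch of that encoding and its check, assuming the 48-byte compressed G1 signature size used by this package (illustrative helper names, not the package API):

package main

import (
	"bytes"
	"fmt"
)

const (
	infinityPointHeader = byte(0xC0) // compressed flag (0x80) + infinity flag (0x40)
	signatureLen        = 48         // compressed G1 point, per this package's signature length
)

// identitySignature builds the serialization of the identity (infinity) signature.
func identitySignature() []byte {
	return append([]byte{infinityPointHeader}, make([]byte, signatureLen-1)...)
}

// isIdentitySignature checks the 0xC0 header followed by all-zero bytes.
func isIdentitySignature(s []byte) bool {
	return len(s) == signatureLen &&
		s[0] == infinityPointHeader &&
		bytes.Equal(s[1:], make([]byte, signatureLen-1))
}

func main() {
	sig := identitySignature()
	fmt.Printf("%x...\n", sig[:4])        // c0000000...
	fmt.Println(isIdentitySignature(sig)) // true
}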
diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9b91e8e0ebd..d722531ec65 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -950,12 +950,14 @@ void E2_write_bytes(byte *bin, const E2* a) { // set p to infinity void E2_set_infty(E2* p) { - vec_zero(p, sizeof(E2)); + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); } // check if `p` is infinity bool_t E2_is_infty(const E2* p) { - return vec_is_zero(p, sizeof(E2)); + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); } // checks affine point `p` is in E2 diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 59776fcec5b..56c63700753 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -166,9 +166,7 @@ func writeScalar(dest []byte, x *scalar) { // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointG2(dest []byte, a *pointE2) { - C.E2_write_bytes((*C.uchar)(&dest[0]), - (*C.E2)(a), - ) + C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) } // writePointG1 writes a G1 point in a slice of bytes diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e29d3401d69..d92b4e992e6 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -86,7 +86,7 @@ void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message data. // The signature and public key are assumed to be in G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -137,7 +137,7 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int goto out; } } - + out: ep_free(elemsG1[0]); ep_free(elemsG1[1]); diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 260c9295994..e0fb9f29460 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -186,13 +186,16 @@ func TestBLSEncodeDecode(t *testing.T) { assert.True(t, IsInvalidInputsError(err)) assert.Nil(t, sk) - // identity public key + // decode an identity public key pkBytes := make([]byte, PubKeyLenBLSBLS12381) pkBytes[0] = infinityPointHeader pk, err := DecodePublicKey(BLSBLS12381, pkBytes) require.NoError(t, err, "decoding identity public key should succeed") assert.True(t, pk.Equals(IdentityBLSPublicKey())) + // encode an identity public key + assert.Equal(t, pk.Encode(), pkBytes) + // invalid point pkBytes = make([]byte, PubKeyLenBLSBLS12381) pkBytes[0] = invalidBLSSignatureHeader @@ -436,7 +439,7 @@ func TestBLSAggregateSignatures(t *testing.T) { // Aggregate n public keys and their respective private keys and compare // the public key of the aggregated private key is equal to the aggregated // public key -func TestBLSAggregatePubKeys(t *testing.T) { +func TestBLSAggregatePublicKeys(t *testing.T) { rand := getPRG(t) // number of keys to aggregate pkNum := rand.Intn(100) + 1 @@ -507,8 +510,8 @@ func TestBLSAggregatePubKeys(t *testing.T) { // check that the public key corresponding to the zero private key is indeed identity // The package doesn't allow to generate a zero private key. 
One way to obtain a zero - // private key is via aggrgeting opposite private keys - t.Run("public key of zero private key", func(t *testing.T) { + // private key is via aggregating opposite private keys + t.Run("Identity public key from identity private key", func(t *testing.T) { // sk1 is group order of bls12-381 minus one groupOrderMinus1 := []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 0xFF, 0xFE, @@ -520,9 +523,39 @@ func TestBLSAggregatePubKeys(t *testing.T) { one[PrKeyLenBLSBLS12381-1] = 1 sk2, err := DecodePrivateKey(BLSBLS12381, one) require.NoError(t, err) + // public key of aggregated private keys aggSK, err := AggregateBLSPrivateKeys([]PrivateKey{sk1, sk2}) require.NoError(t, err) assert.True(t, aggSK.PublicKey().Equals(IdentityBLSPublicKey())) + // aggregated public keys + aggPK, err := AggregateBLSPublicKeys([]PublicKey{sk1.PublicKey(), sk2.PublicKey()}) + require.NoError(t, err) + assert.True(t, aggPK.Equals(IdentityBLSPublicKey())) + // check of internal identity flag + blsKey, ok := aggPK.(*pubKeyBLSBLS12381) + require.True(t, ok) + assert.True(t, blsKey.isIdentity) + // check of encoding header + pkBytes := aggPK.Encode() + assert.Equal(t, infinityPointHeader, pkBytes[0]) + }) + + t.Run("Identity public key from opposite points", func(t *testing.T) { + pkBytes := pks[0].Encode() + negatePoint(pkBytes) + minusPk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.NoError(t, err) + // aggregated public keys + aggPK, err := AggregateBLSPublicKeys([]PublicKey{pks[0], minusPk}) + require.NoError(t, err) + assert.True(t, aggPK.Equals(IdentityBLSPublicKey())) + // check of internal identity flag + blsKey, ok := aggPK.(*pubKeyBLSBLS12381) + require.True(t, ok) + assert.True(t, blsKey.isIdentity) + // check of encoding header + pkBytes = aggPK.Encode() + assert.Equal(t, infinityPointHeader, pkBytes[0]) }) } From f387ce90839937773dfd550f4638a0dd332602e4 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 14:07:41 -0600 Subject: [PATCH 051/200] fix warning --- crypto/bls12381_utils.c | 26 +++++++++++++------------- crypto/bls12381_utils.go | 2 +- crypto/bls12381_utils.h | 10 +++++----- crypto/dkg_test.go | 15 ++++++++------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d722531ec65..64efee4bcfc 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -154,17 +154,17 @@ Fr* Fr_relic_to_blst(const bn_st* x){ // returns true if a == 0 and false otherwise bool_t Fr_is_zero(const Fr* a) { - return bytes_are_zero((const byte*)a, Fr_BYTES); + return bytes_are_zero((const byte*)a, sizeof(Fr)); } // returns true if a == b and false otherwise bool_t Fr_is_equal(const Fr* a, const Fr* b) { - return vec_is_equal(a, b, Fr_BYTES); + return vec_is_equal(a, b, sizeof(Fr)); } // sets `a` to limb `l` void Fr_set_limb(Fr* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), Fr_BYTES - sizeof(limb_t)); + vec_zero((byte*)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); *((limb_t*)a) = l; } @@ -304,7 +304,7 @@ static void pow256_from_Fr(pow256 ret, const Fr* in) { // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr // - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { +BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len) { if (len != Fr_BYTES) { return BLST_BAD_ENCODING; } @@ -325,7 +325,7 @@ BLST_ERROR 
Fr_read_bytes(Fr* a, const uint8_t *bin, int len) { // - BLST_BAD_ENCODING if the length is invalid // - BLST_BAD_SCALAR if the scalar isn't in Fr_star // - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { +BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { int ret = Fr_read_bytes(a, bin, len); if (ret != BLST_SUCCESS) { return ret; @@ -338,28 +338,28 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len) { } // write Fr element `a` in big endian bytes. -void Fr_write_bytes(uint8_t *bin, const Fr* a) { +void Fr_write_bytes(byte *bin, const Fr* a) { be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); } // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is vec256 (also used as Fr) -static void vec256_from_be_bytes(Fr* out, const unsigned char *bytes, size_t n) +static void vec256_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); Fr_copy(&radix, (Fr*)BLS12_381_rRR); // R^2 - bytes += n; + byte* p = bytes + n; while (n > Fr_BYTES) { - limbs_from_be_bytes((limb_t*)&digit, bytes -= Fr_BYTES, Fr_BYTES); // l_i + limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) Fr_add(out, out, &digit); Fr_mul_montg(&radix, &radix, (Fr*)BLS12_381_rRR); // R^(i+1) n -= Fr_BYTES; } Fr_set_zero(&digit); - limbs_from_be_bytes((limb_t*)&digit, bytes -= n, n); + limbs_from_be_bytes((limb_t*)&digit, p - n, n); Fr_mul_montg(&digit, &digit, &radix); Fr_add(out, out, &digit); // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n @@ -504,8 +504,8 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static byte Fp_get_sign(const fp_t y) { - return sgn0_pty_mont_384(y, BLS12_381_P, p0); +static byte Fp_get_sign(const Fp* y) { + return sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0); } // ------------------- Fp^2 utilities @@ -1303,7 +1303,7 @@ void ep2_rand_G2complement(ep2_t p) { // This is a testing function. // It wraps a call to a Relic macro since cgo can't call macros. 
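The vec256_from_be_bytes routine patched below reduces an arbitrary-length big-endian byte string into Fr by processing it limb-chunk by limb-chunk in Montgomery form, accumulating l_1*R + l_2*R^2 + ... before a final conversion. The end result is simply the input interpreted as a big-endian integer reduced modulo the group order r, which is also why randFr samples an extra 128 bits before reducing: the wider input makes the reduced output nearly uniform. A sketch of the equivalent computation with math/big, taking r from the groupOrderMinus1 test vector earlier in these patches plus one:

package main

import (
	"fmt"
	"math/big"
)

// r is the BLS12-381 scalar field (group) order.
var r, _ = new(big.Int).SetString(
	"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// mapBytesToFr reduces an arbitrary-length big-endian byte string into [0, r),
// the same value the limb-wise Montgomery accumulation computes.
func mapBytesToFr(b []byte) *big.Int {
	return new(big.Int).Mod(new(big.Int).SetBytes(b), r)
}

func main() {
	in := make([]byte, 48) // e.g. an expanded hash output, wider than the 32-byte modulus
	for i := range in {
		in[i] = 0xff
	}
	fmt.Printf("%x\n", mapBytesToFr(in))
}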
-void xmd_sha256(uint8_t *hash, int len_hash, uint8_t *msg, int len_msg, uint8_t *dst, int len_dst){ +void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ md_xmd_sh256(hash, len_hash, msg, len_msg, dst, len_dst); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 56c63700753..2c5da2495f4 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,7 +7,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -fsanitize=thread -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 01f68610603..ecdc0ada0fe 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -113,10 +113,10 @@ void Fr_from_montg(Fr *res, const Fr *a); void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); void Fr_inv_exp_montg(Fr *res, const Fr *a); -BLST_ERROR Fr_read_bytes(Fr* a, const uint8_t *bin, int len); -BLST_ERROR Fr_star_read_bytes(Fr* a, const uint8_t *bin, int len); -void Fr_write_bytes(uint8_t *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const uint8_t*, int); +BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); +BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); +void Fr_write_bytes(byte *bin, const Fr* a); +bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities @@ -166,7 +166,7 @@ void precomputed_data_set(const prec_st* p); void seed_relic(byte*, int); // utility testing function -void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); +void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions void bytes_print_(char*, byte*, int); diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 0329eb453ea..b2d55e6bf18 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -104,17 +104,18 @@ func testJointFeldman(t *testing.T) { n := 4 var threshold int // happy path, test multiple values of thresold - for threshold = MinimumThreshold; threshold < n; threshold++ { - t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, happyPath) - }) - } + //for threshold = MinimumThreshold; threshold < n; threshold++ { + threshold = optimalThreshold(n) + t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + dkgCommonTest(t, jointFeldman, n, threshold, happyPath) + }) + //} // unhappy path, with focus on the optimal threshold value n = 5 threshold = optimalThreshold(n) // unhappy path, with invalid shares - t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + /*t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, 
threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, invalidShares) }) // unhappy path, with invalid vector @@ -132,7 +133,7 @@ func testJointFeldman(t *testing.T) { // unhappy path, with duplicated messages (all types) t.Run(fmt.Sprintf("JointFeldman_DuplicatedMessages_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, duplicatedMessages) - }) + })*/ } // Supported Key Generation protocols From cbe51a372af1605e699b62dfdd10d1b0a67069d1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 14:14:58 -0600 Subject: [PATCH 052/200] disable thread SAN --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 2c5da2495f4..56c63700753 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,7 +7,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -fsanitize=thread -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ From be727d855fb5e28d94821e1a0cc9aca2e704c358 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 14:41:25 -0600 Subject: [PATCH 053/200] add SIGILL handler --- crypto/bls12381_utils.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 56c63700753..be5991fb0e9 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -12,6 +12,25 @@ package crypto // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" +// +// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) +// # include +// # include +// static void handler(int signum) +// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " +// "consult /bindings/go/README.md.\n", 70); +// _exit(128+SIGILL); +// (void)n; +// } +// __attribute__((constructor)) static void blst_cgo_init() +// { blst_fp temp = { 0 }; +// struct sigaction act = { handler }, oact; +// sigaction(SIGILL, &act, &oact); +// blst_fp_sqr(&temp, &temp); +// sigaction(SIGILL, &oact, NULL); +// } +// #endif +// import "C" import ( "crypto/rand" From 6bd85a79a83393fe77954c2e5ded7bd8697b6f68 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 16:37:03 -0600 Subject: [PATCH 054/200] fix blst_cgo_init --- crypto/bls12381_utils.c | 2 +- crypto/bls12381_utils.go | 6 +++--- crypto/bls12381_utils.h | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 64efee4bcfc..b9ec974fee3 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -350,7 +350,7 @@ static void vec256_from_be_bytes(Fr* out, const byte *bytes, 
size_t n) Fr_set_zero(out); Fr_copy(&radix, (Fr*)BLS12_381_rRR); // R^2 - byte* p = bytes + n; + byte* p = (byte*)bytes + n; while (n > Fr_BYTES) { limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index be5991fb0e9..52a0dde0248 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -23,10 +23,10 @@ package crypto // (void)n; // } // __attribute__((constructor)) static void blst_cgo_init() -// { blst_fp temp = { 0 }; -// struct sigaction act = { handler }, oact; +// { Fp temp = { 0 }; +// struct sigaction act = {{ handler }}, oact; // sigaction(SIGILL, &act, &oact); -// blst_fp_sqr(&temp, &temp); +// Fp_squ_montg(&temp, &temp); // sigaction(SIGILL, &oact, NULL); // } // #endif diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index ecdc0ada0fe..ca69b584201 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -119,6 +119,8 @@ void Fr_write_bytes(byte *bin, const Fr* a); bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities +void Fp_mul_montg(Fp *, const Fp *, const Fp *); +void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities int ep_read_bin_compact(ep_t, const byte *, const int); From 76fcc73fcb6a0d58b55012edce7aedf28aaa395c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 26 Apr 2023 21:01:39 -0600 Subject: [PATCH 055/200] disable ADX instructions in BlST by default as a temp measure --- crypto/Makefile | 2 +- crypto/bls12381_utils.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index c66774e1033..d87f27c440f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -22,7 +22,7 @@ relic_tests: ifeq ($(ADX_SUPPORT), 1) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) else - CGO_CFLAGS="-D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) + CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) endif # test all packages that do not require Relic library (all functionalities except the BLS-related ones) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 52a0dde0248..38e012a1510 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -7,7 +7,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: 
-D__BLST_NO_ASM__ @@ -17,12 +17,12 @@ package crypto // # include // # include // static void handler(int signum) -// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " -// "consult /bindings/go/README.md.\n", 70); +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=-O -D__BLST_PORTABLE__"; +// ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; // } -// __attribute__((constructor)) static void blst_cgo_init() +// __attribute__((constructor)) static void flow_crypto_cgo_init() // { Fp temp = { 0 }; // struct sigaction act = {{ handler }}, oact; // sigaction(SIGILL, &act, &oact); From 2a851b5551cb775af5b2db0a2c19dad7d72c8c1e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 28 Apr 2023 14:45:43 -0600 Subject: [PATCH 056/200] uncomment DKG tests --- crypto/dkg_test.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index b2d55e6bf18..0329eb453ea 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -104,18 +104,17 @@ func testJointFeldman(t *testing.T) { n := 4 var threshold int // happy path, test multiple values of thresold - //for threshold = MinimumThreshold; threshold < n; threshold++ { - threshold = optimalThreshold(n) - t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, happyPath) - }) - //} + for threshold = MinimumThreshold; threshold < n; threshold++ { + t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + dkgCommonTest(t, jointFeldman, n, threshold, happyPath) + }) + } // unhappy path, with focus on the optimal threshold value n = 5 threshold = optimalThreshold(n) // unhappy path, with invalid shares - /*t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { + t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, invalidShares) }) // unhappy path, with invalid vector @@ -133,7 +132,7 @@ func testJointFeldman(t *testing.T) { // unhappy path, with duplicated messages (all types) t.Run(fmt.Sprintf("JointFeldman_DuplicatedMessages_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { dkgCommonTest(t, jointFeldman, n, threshold, duplicatedMessages) - })*/ + }) } // Supported Key Generation protocols From 26e56364f9a929a90879d9dc782916b6f1e4b12d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 28 Apr 2023 20:09:24 -0600 Subject: [PATCH 057/200] more logging in FeldmanVSSQ when shares aren't matching computed keys from verif vector --- crypto/bls.go | 4 ++-- crypto/bls12381_utils.go | 13 +++++++++++++ crypto/dkg_feldmanvssq.go | 24 ++++++++++++++++-------- crypto/dkg_jointfeldman.go | 2 +- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index a2d372aca25..c3a413b6443 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -444,7 +444,7 @@ func (sk *prKeyBLSBLS12381) Equals(other PrivateKey) bool { // String returns the hex string representation of the key. func (sk *prKeyBLSBLS12381) String() string { - return fmt.Sprintf("%#x", sk.Encode()) + return sk.scalar.String() } // pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, @@ -520,7 +520,7 @@ func (pk *pubKeyBLSBLS12381) Equals(other PublicKey) bool { // String returns the hex string representation of the key. 
func (pk *pubKeyBLSBLS12381) String() string { - return fmt.Sprintf("%#x", pk.Encode()) + return pk.point.String() } // Get Macro definitions from the C layer as Cgo does not export macros diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 38e012a1510..735e1ffc00e 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -35,6 +35,7 @@ import "C" import ( "crypto/rand" "errors" + "fmt" ) // Go wrappers around BLST C types @@ -69,6 +70,18 @@ var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) var blst_point_not_on_curve = (int)(C.BLST_POINT_NOT_ON_CURVE) +func (a *scalar) String() string { + encoding := make([]byte, frBytesLen) + writeScalar(encoding, a) + return fmt.Sprintf("%#x", encoding) +} + +func (p *pointE2) String() string { + encoding := make([]byte, pubKeyLengthBLSBLS12381) + writePointG2(encoding, p) + return fmt.Sprintf("%#x", encoding) +} + // initContext sets relic B12_381 parameters and precomputes some data in the C layer func (ct *ctx) initContext() error { c := C.relic_init_BLS12_381() diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 38b3667ffae..5a10a210949 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -162,7 +162,7 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) if c.received && !c.answerReceived { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint from %d was not answered", + fmt.Sprintf("complaint from (%d) was not answered", complainer)) break } @@ -412,7 +412,7 @@ func (s *feldmanVSSQualState) receiveShare(origin index, data []byte) { if s.vAReceived { if !s.verifyShare() { - // otherwise, build a complaint + // build a complaint s.buildAndBroadcastComplaint() } } @@ -465,8 +465,8 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { if s.checkComplaint(complainer, c) { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("verification vector received: a complaint answer to %d is invalid", - complainer)) + fmt.Sprintf("verification vector received: a complaint answer to (%d) is invalid, answer is %s, computed key is %s", + complainer, &c.answer, &s.y[complainer])) return } } @@ -482,6 +482,14 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { // build a complaint against the dealer, add it to the local // complaint map and broadcast it func (s *feldmanVSSQualState) buildAndBroadcastComplaint() { + var logMsg string + if s.vAReceived && s.xReceived { + logMsg = fmt.Sprintf("building a complaint, share is %s, computed public key is %s", + &s.x, &s.y[s.myIndex]) + } else { + logMsg = "building a complaint" + } + s.processor.FlagMisbehavior(int(s.dealerIndex), logMsg) s.complaints[s.myIndex] = &complaint{ received: true, answerReceived: false, @@ -582,8 +590,8 @@ func (s *feldmanVSSQualState) receiveComplaint(origin index, data []byte) { s.disqualified = s.checkComplaint(origin, c) if s.disqualified { s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint received: complaint answer to %d is invalid", - origin)) + fmt.Sprintf("complaint received: answer to (%d) is invalid, answer is %s, computed public key is %s", + origin, &c.answer, &s.y[origin])) } return } @@ -656,8 +664,8 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) s.disqualified = s.checkComplaint(complainer, c) if s.disqualified { s.processor.Disqualify(int(s.dealerIndex), - 
fmt.Sprintf("complaint answer received: complaint answer to %d is invalid", - complainer)) + fmt.Sprintf("complaint answer received: answer to (%d) is invalid, answer is %s, computed key is %s", + complainer, &c.answer, &s.y[complainer])) } } diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index b15c421dde6..8de9695a0c5 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -194,7 +194,7 @@ func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { if disqualifiedTotal > s.threshold || s.size-disqualifiedTotal <= s.threshold { return nil, nil, nil, dkgFailureErrorf( - "Joint-Feldman failed because the diqualified participants number is high: %d disqualified, threshold is %d, size is %d", + "Joint-Feldman failed because the disqualified participants number is high: %d disqualified, threshold is %d, size is %d", disqualifiedTotal, s.threshold, s.size) } From 5e01e46951cd2e2967f78ccd8e1fc7395764185f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 1 May 2023 10:57:47 -0600 Subject: [PATCH 058/200] use new KeyGenSeedMinLen --- cmd/bootstrap/cmd/dkg.go | 2 +- cmd/bootstrap/dkg/dkg_test.go | 2 +- cmd/bootstrap/run/qc_test.go | 2 +- consensus/hotstuff/signature/randombeacon_inspector_test.go | 2 +- engine/consensus/dkg/reactor_engine.go | 2 +- integration/testnet/util.go | 2 +- module/dkg/controller_test.go | 2 +- state/protocol/badger/validity_test.go | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/bootstrap/cmd/dkg.go b/cmd/bootstrap/cmd/dkg.go index b190b1a7c2c..de7f01bc6bd 100644 --- a/cmd/bootstrap/cmd/dkg.go +++ b/cmd/bootstrap/cmd/dkg.go @@ -22,7 +22,7 @@ func runDKG(nodes []model.NodeInfo) dkg.DKGData { if flagFastKG { dkgData, err = bootstrapDKG.RunFastKG(n, flagBootstrapRandomSeed) } else { - dkgData, err = bootstrapDKG.RunDKG(n, GenerateRandomSeeds(n, crypto.SeedMinLenDKG)) + dkgData, err = bootstrapDKG.RunDKG(n, GenerateRandomSeeds(n, crypto.KeyGenSeedMinLen)) } if err != nil { log.Fatal().Err(err).Msg("error running DKG") diff --git a/cmd/bootstrap/dkg/dkg_test.go b/cmd/bootstrap/dkg/dkg_test.go index 9835cdca538..73fb185ec33 100644 --- a/cmd/bootstrap/dkg/dkg_test.go +++ b/cmd/bootstrap/dkg/dkg_test.go @@ -10,7 +10,7 @@ import ( ) func TestRunDKG(t *testing.T) { - seedLen := crypto.SeedMinLenDKG + seedLen := crypto.KeyGenSeedMinLen _, err := RunDKG(0, unittest.SeedFixtures(2, seedLen)) require.EqualError(t, err, "n needs to match the number of seeds (0 != 2)") diff --git a/cmd/bootstrap/run/qc_test.go b/cmd/bootstrap/run/qc_test.go index 5deed36d1ed..4f925a5e793 100644 --- a/cmd/bootstrap/run/qc_test.go +++ b/cmd/bootstrap/run/qc_test.go @@ -50,7 +50,7 @@ func createSignerData(t *testing.T, n int) *ParticipantData { networkingKeys := unittest.NetworkingKeys(n) stakingKeys := unittest.StakingKeys(n) - seed := make([]byte, crypto.SeedMinLenDKG) + seed := make([]byte, crypto.KeyGenSeedMinLen) _, err := rand.Read(seed) require.NoError(t, err) randomBSKs, randomBPKs, groupKey, err := crypto.BLSThresholdKeyGen(n, diff --git a/consensus/hotstuff/signature/randombeacon_inspector_test.go b/consensus/hotstuff/signature/randombeacon_inspector_test.go index 5784577f668..5e2a08e7c91 100644 --- a/consensus/hotstuff/signature/randombeacon_inspector_test.go +++ b/consensus/hotstuff/signature/randombeacon_inspector_test.go @@ -40,7 +40,7 @@ func (rs *randomBeaconSuite) SetupTest() { // generate threshold keys mrand.Seed(time.Now().UnixNano()) - seed := make([]byte, crypto.SeedMinLenDKG) + seed := 
make([]byte, crypto.KeyGenSeedMinLen) _, err := mrand.Read(seed) require.NoError(rs.T(), err) rs.skShares, rs.pkShares, rs.pkGroup, err = crypto.BLSThresholdKeyGen(rs.n, rs.threshold, seed) diff --git a/engine/consensus/dkg/reactor_engine.go b/engine/consensus/dkg/reactor_engine.go index 1704483ef48..1d23344e4c6 100644 --- a/engine/consensus/dkg/reactor_engine.go +++ b/engine/consensus/dkg/reactor_engine.go @@ -348,7 +348,7 @@ func (e *ReactorEngine) getDKGInfo(firstBlockID flow.Identifier) (*dkgInfo, erro if err != nil { return nil, fmt.Errorf("could not retrieve epoch dkg final views: %w", err) } - seed := make([]byte, crypto.SeedMinLenDKG) + seed := make([]byte, crypto.KeyGenSeedMinLen) _, err = rand.Read(seed) if err != nil { return nil, fmt.Errorf("could not generate random seed: %w", err) } diff --git a/integration/testnet/util.go b/integration/testnet/util.go index ad45be97c82..52ab6af17a0 100644 --- a/integration/testnet/util.go +++ b/integration/testnet/util.go @@ -71,7 +71,7 @@ func toNodeInfos(confs []ContainerConfig) []bootstrap.NodeInfo { } func getSeed() ([]byte, error) { - seedLen := int(math.Max(crypto.SeedMinLenDKG, crypto.KeyGenSeedMinLen)) + seedLen := int(math.Max(crypto.KeyGenSeedMinLen, crypto.KeyGenSeedMinLen)) seed := make([]byte, seedLen) n, err := rand.Read(seed) if err != nil || n != seedLen { diff --git a/module/dkg/controller_test.go b/module/dkg/controller_test.go index 03f10adf1c1..2e3b8cce8b5 100644 --- a/module/dkg/controller_test.go +++ b/module/dkg/controller_test.go @@ -248,7 +248,7 @@ func initNodes(t *testing.T, n int, phase1Duration, phase2Duration, phase3Durati logger: logger, } - seed := unittest.SeedFixture(20) + seed := unittest.SeedFixture(crypto.KeyGenSeedMinLen) dkg, err := crypto.NewJointFeldman(n, signature.RandomBeaconThreshold(n), i, broker) require.NoError(t, err) diff --git a/state/protocol/badger/validity_test.go b/state/protocol/badger/validity_test.go index 2c0e3372e4b..9d564d76e30 100644 --- a/state/protocol/badger/validity_test.go +++ b/state/protocol/badger/validity_test.go @@ -49,7 +49,7 @@ func TestEpochSetupValidity(t *testing.T) { t.Run("short seed", func(t *testing.T) { _, result, _ := unittest.BootstrapFixture(participants) setup := result.ServiceEvents[0].Event.(*flow.EpochSetup) - setup.RandomSource = unittest.SeedFixture(crypto.SeedMinLenDKG - 1) + setup.RandomSource = unittest.SeedFixture(crypto.KeyGenSeedMinLen - 1) err := verifyEpochSetup(setup, true) require.Error(t, err) From bbd3c74797ef92ed6438ea98dde842c0a5211ef4 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 2 May 2023 17:39:30 -0600 Subject: [PATCH 059/200] fix a bug when zeroing a buffer that is not a multiple of 8 bytes with BLST's vec_zero --- crypto/bls12381_utils.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b9ec974fee3..2ef4ca2e3e2 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -277,12 +277,12 @@ void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { // internal type of BLST `pow256` uses bytes little endian. // input is bytes big endian as used by Flow crypto lib external scalars.
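/* Illustrative note (not part of the original patch): the commit above refers to BLST's
 * vec_zero, which clears memory limb by limb (8 bytes at a time on 64-bit targets), so a
 * byte count that is not a multiple of 8 is rounded down and the trailing bytes are left
 * untouched. A minimal sketch of the pitfall, assuming that limb-wise behaviour:
 *
 *     byte tmp[10];
 *     vec_zero(tmp, sizeof(tmp));  // 10/8 == 1 limb cleared, bytes 8..9 keep stale data
 *     memset(tmp, 0, sizeof(tmp)); // byte-granular alternative clearing all 10 bytes
 */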
-static void pow256_from_be_bytes(pow256 ret, const unsigned char a[Fr_BYTES]) +static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) { - unsigned char* b = (unsigned char*)a + Fr_BYTES - 1; + byte* b = (byte*)a + Fr_BYTES - 1; if ((uptr_t)ret == (uptr_t)a) { // swap in place for (int i=0; i Date: Tue, 2 May 2023 19:11:58 -0600 Subject: [PATCH 060/200] clean ups --- cmd/bootstrap/cmd/dkg.go | 2 +- crypto/bls12381_utils.c | 7 +++---- crypto/bls_core.c | 3 --- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/cmd/bootstrap/cmd/dkg.go b/cmd/bootstrap/cmd/dkg.go index de7f01bc6bd..da81d0551c6 100644 --- a/cmd/bootstrap/cmd/dkg.go +++ b/cmd/bootstrap/cmd/dkg.go @@ -20,7 +20,7 @@ func runDKG(nodes []model.NodeInfo) dkg.DKGData { var dkgData dkg.DKGData var err error if flagFastKG { - dkgData, err = bootstrapDKG.RunFastKG(n, flagBootstrapRandomSeed) + dkgData, err = bootstrapDKG.RunFastKG(n, GenerateRandomSeed(crypto.KeyGenSeedMinLen)) } else { dkgData, err = bootstrapDKG.RunDKG(n, GenerateRandomSeeds(n, crypto.KeyGenSeedMinLen)) } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 3a0ae79fcc9..d57d31c5861 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -116,10 +116,9 @@ prec_st* init_precomputed_data_BLS12_381() { // ------------------- Fr utilities // Montgomery constant R related to the curve order r -const Fr BLS12_381_rR = { /* R mod r = (1<<256)%r */ - TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), - TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe) -}; +// R mod r = (1<<256)%r +const Fr BLS12_381_rR = { TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ + TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), }; // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index ca51d2dc09f..58a7287578f 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -472,7 +472,6 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, if (!sigs) goto out_sigs; for (int i=0; i < sigs_len; i++) { ep_new(sigs[i]); - ep2_new(pks[i]); } for (int i=0; i < sigs_len; i++) { @@ -501,8 +500,6 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // r = random + 1 Fr_set_limb(&one, 1); Fr_add(&r, &r, &one); - /*char str[20]; sprintf(str, "r-%d", i); - Fr_print_(str, &r);*/ // multiply public key and signature by the same random exponent r E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? 
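/* Illustrative note (not part of the original patch): scaling each pair by the same
 * non-zero scalar is the standard random-linear-combination batch check. With signatures
 * in G1 and public keys in G2 (as in this library), the batched equation is, informally,
 *
 *     e( sum_i r_i * sig_i , g2 ) == prod_i e( H(m_i) , r_i * pk_i )
 *
 * which holds for honest inputs and fails with overwhelming probability if any single
 * pair is invalid; r = random + 1 simply keeps r non-zero so no term can be cancelled
 * out of the sum.
 */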
bn_st* tmp = Fr_blst_to_relic(&r); From 9f20a59cc224d64cac53f55e5504cb23133fe453 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 3 May 2023 12:08:32 -0600 Subject: [PATCH 061/200] clean up encode/decode tests in BLS --- crypto/bls_test.go | 80 +++++++++++++++++++++------------------ crypto/sign_test_utils.go | 2 +- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index c7f58fec010..c3e9bb6e9db 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -180,29 +180,35 @@ func TestBLSEncodeDecode(t *testing.T) { // specific tests for BLS // zero private key - skBytes := make([]byte, PrKeyLenBLSBLS12381) - sk, err := DecodePrivateKey(BLSBLS12381, skBytes) - require.Error(t, err, "decoding identity private key should fail") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - - // decode an identity public key - pkBytes := make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = infinityPointHeader - pk, err := DecodePublicKey(BLSBLS12381, pkBytes) - require.NoError(t, err, "decoding identity public key should succeed") - assert.True(t, pk.Equals(IdentityBLSPublicKey())) - - // encode an identity public key - assert.Equal(t, pk.Encode(), pkBytes) + t.Run("zero private key", func(t *testing.T) { + skBytes := make([]byte, PrKeyLenBLSBLS12381) + sk, err := DecodePrivateKey(BLSBLS12381, skBytes) + require.Error(t, err, "decoding identity private key should fail") + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) + }) + + // identity public key + t.Run("infinity public key", func(t *testing.T) { + // decode an identity public key + pkBytes := make([]byte, PubKeyLenBLSBLS12381) + pkBytes[0] = infinityPointHeader + pk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.NoError(t, err, "decoding identity public key should succeed") + assert.True(t, pk.Equals(IdentityBLSPublicKey())) + // encode an identity public key + assert.Equal(t, pk.Encode(), pkBytes) + }) // invalid point - pkBytes = make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = invalidBLSSignatureHeader - pk, err = DecodePublicKey(BLSBLS12381, pkBytes) - require.Error(t, err, "the key decoding should fail - key value is invalid") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) + t.Run("invalid public key", func(t *testing.T) { + pkBytes := make([]byte, PubKeyLenBLSBLS12381) + pkBytes[0] = invalidBLSSignatureHeader + pk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.Error(t, err, "the key decoding should fail - key value is invalid") + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, pk) + }) // Test a public key serialization with a point encoded with a coordinate x with // x[0] or x[1] not reduced mod p. @@ -213,21 +219,23 @@ func TestBLSEncodeDecode(t *testing.T) { // Although uniqueness of public key respresentation isn't a security property, some implementations // may implicitely rely on the property. 
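// Illustrative note (not part of the original patch): each coordinate component of a
// compressed G2 key is an integer that must be strictly reduced modulo the field prime p.
// Since c and c+p denote the same field element, accepting an unreduced component would
// give a second valid byte encoding for the same key; the vectors below are built by
// adding p to x[0] (respectively x[1]) of a valid key and must therefore be rejected.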
- // valid pk with x[0] < p and x[1] < p - validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, validPk) - assert.NoError(t, err) - // invalidpk1 with x[0]+p and same x[1] - invalidPk1, err := hex.DecodeString("9B8E840277BE772540D913E47A94F94C00003BBE60C4CEEB0C0ABCC9E876034089000EC7AF5AB6D81AF62EC9363D5E63038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, invalidPk1) - assert.Error(t, err) - // invalidpk1 with same x[0] and x[1]+p - invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, invalidPk2) - assert.Error(t, err) + t.Run("public key with non-reduced coordinates", func(t *testing.T) { + // valid pk with x[0] < p and x[1] < p + validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, validPk) + assert.NoError(t, err) + // invalidpk1 with x[0]+p and same x[1] + invalidPk1, err := hex.DecodeString("9B8E840277BE772540D913E47A94F94C00003BBE60C4CEEB0C0ABCC9E876034089000EC7AF5AB6D81AF62EC9363D5E63038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, invalidPk1) + assert.Error(t, err) + // invalidpk1 with same x[0] and x[1]+p + invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, invalidPk2) + assert.Error(t, err) + }) } // TestBLSEquals tests equal for BLS keys diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 6ab599cff47..8e2cd1e931f 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -160,7 +160,7 @@ var BLS12381Order = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("encode/decode for %s", salg), func(t *testing.T) { + t.Run(fmt.Sprintf("generic encode/decode for %s", salg), func(t *testing.T) { rand := getPRG(t) t.Run("happy path tests", func(t *testing.T) { From f2731a8a0030ba70f6e3e8d1e8f0c5fe47546a31 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 3 May 2023 12:12:31 -0600 Subject: [PATCH 062/200] add endianness test for maptoFr --- crypto/bls12381_utils.go | 2 +- crypto/bls12381_utils_test.go | 43 ++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 6bbca17f2a5..0f685494d4f 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -160,7 +160,7 @@ func randFrStar(x *scalar, rand 
random.Rand) { } } -// mapToFr reads a scalar from a slice of bytes and maps it to Zr. +// mapToFr reads a scalar from a slice of bytes and maps it to Fr using modular reduction. // The resulting element `k` therefore satisfies 0 <= k < r. // It returns true if scalar is zero and false otherwise. func mapToFr(x *scalar, src []byte) bool { diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 23e13d303ce..563ca26811b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -54,7 +54,6 @@ func BenchmarkScalarMultG1G2(b *testing.B) { // Sanity-check of the map-to-G1 with regards to the IETF draft hash-to-curve func TestMapToG1(t *testing.T) { - // test vectors from https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#appendix-J.9.1 dst := []byte("QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_") @@ -160,5 +159,47 @@ func BenchmarkSubgroupCheck(b *testing.B) { } b.StopTimer() }) +} +// test some edge cases of MapToFr to validate modular reduction and endianness: +// - inputs `0` and curve order `r` +// - inputs `1` and `r+1` +func TestMapToFr(t *testing.T) { + var x scalar + offset := 10 + bytes := make([]byte, frBytesLen+offset) + expectedEncoding := make([]byte, frBytesLen) + // zero bytes + isZero := mapToFr(&x, bytes) + assert.True(t, isZero) + assert.True(t, x.isZero()) + assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) + // curve order bytes + copy(bytes[offset:], BLS12381Order) + isZero = mapToFr(&x, bytes) + assert.True(t, isZero) + assert.True(t, x.isZero()) + assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) + // curve order + 1 + g1, err := hex.DecodeString("824aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb813e02b6052719f607dacd3a088274f65596bd0d09920b61ab5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e") + require.NoError(t, err) + bytes[len(bytes)-1] += 1 + isZero = mapToFr(&x, bytes) + assert.False(t, isZero) + assert.False(t, x.isZero()) + expectedEncoding[frBytesLen-1] = 1 + sk := newPrKeyBLSBLS12381(&x) + assert.Equal(t, expectedEncoding, sk.Encode()) + // check scalar is equal to "1" in the lower layer (scalar multiplication) + assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") + // 1 + copy(bytes[offset:], expectedEncoding) + isZero = mapToFr(&x, bytes) + assert.False(t, isZero) + assert.False(t, x.isZero()) + expectedEncoding[frBytesLen-1] = 1 + sk = newPrKeyBLSBLS12381(&x) + assert.Equal(t, expectedEncoding, sk.Encode()) + // check scalar is equal to "1" in the lower layer (scalar multiplication) + assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") } From 70c3c64734e64e8207b5fd3b72d57f1e4ce6c7ac Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 3 May 2023 12:39:54 -0600 Subject: [PATCH 063/200] add endianness comment --- crypto/bls12381_utils.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d57d31c5861..ccec6c78d17 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -329,11 +329,12 @@ BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { // write Fr element `a` in big endian bytes. 
void Fr_write_bytes(byte *bin, const Fr* a) { + // be_bytes_from_limbs works for both limb endiannesses be_bytes_from_limbs(bin, (limb_t*)a, Fr_BYTES); } // maps big-endian bytes into an Fr element using modular reduction -// Input is byte-big-endian, output is vec256 (also used as Fr) +// Input is byte-big-endian, output is Fr (internally vec256) static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; @@ -342,6 +343,7 @@ static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) byte* p = (byte*)bytes + n; while (n > Fr_BYTES) { + // limbs_from_be_bytes works for both limb endiannesses limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) Fr_add(out, out, &digit); From 21c468693a6a7fcaf03910ef6cc729772ab8b1b5 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 13:20:19 -0600 Subject: [PATCH 064/200] add blst Go package as an internal test package --- crypto/internal/blst/blst.go | 3346 +++++++++++++++++++++++++++++++ crypto/internal/blst/blst.h | 483 +++++ crypto/internal/blst/blst_aux.h | 111 + 3 files changed, 3940 insertions(+) create mode 100644 crypto/internal/blst/blst.go create mode 100644 crypto/internal/blst/blst.h create mode 100644 crypto/internal/blst/blst_aux.h diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go new file mode 100644 index 00000000000..97b9047d1e3 --- /dev/null +++ b/crypto/internal/blst/blst.go @@ -0,0 +1,3346 @@ +/* + * Copied from https://github.com/supranational/blst + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +// #cgo CFLAGS: -I${SRCDIR} -I${SRCDIR}/../../blst_src/build -I${SRCDIR}/../../blst_src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ +// #include "blst.h" +// +// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) +// # include +// # include +// static void handler(int signum) +// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " +// "consult /bindings/go/README.md.\n", 70); +// _exit(128+SIGILL); +// (void)n; +// } +// __attribute__((constructor)) static void blst_cgo_init() +// { blst_fp temp = { 0 }; +// struct sigaction act = { handler }, oact; +// sigaction(SIGILL, &act, &oact); +// blst_fp_sqr(&temp, &temp); +// sigaction(SIGILL, &oact, NULL); +// } +// #endif +// +// static size_t go_pairing_sizeof(size_t DST_len) +// { return (blst_pairing_sizeof() + DST_len + sizeof(blst_pairing) - 1) / +// sizeof(blst_pairing); +// } +// static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, +// const byte *DST, size_t DST_len) +// { if (DST != NULL) { +// byte *dst = (byte*)new_ctx + blst_pairing_sizeof(); +// for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i]; +// DST = dst; +// } +// blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len); +// } +// static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx) +// { *pt = *blst_pairing_as_fp12(ctx); } +// +// static void go_p1slice_to_affine(blst_p1_affine dst[], +// const blst_p1 points[], size_t npoints) +// { const blst_p1 *ppoints[2] = { points, NULL }; +// blst_p1s_to_affine(dst, ppoints, npoints); +// } +// static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[], +// size_t npoints) +// { const 
blst_p1_affine *ppoints[2] = { points, NULL }; +// blst_p1s_add(dst, ppoints, npoints); +// } +// static void go_p2slice_to_affine(blst_p2_affine dst[], +// const blst_p2 points[], size_t npoints) +// { const blst_p2 *ppoints[2] = { points, NULL }; +// blst_p2s_to_affine(dst, ppoints, npoints); +// } +// static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[], +// size_t npoints) +// { const blst_p2_affine *ppoints[2] = { points, NULL }; +// blst_p2s_add(dst, ppoints, npoints); +// } +// +// static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p1 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p1_generator(); +// else if (affine) +// blst_p1_from_affine(m, p), p = m; +// blst_p1_mult(m, p, scalar, nbits); +// blst_p1_add_or_double(acc, acc, m); +// } +// static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p2 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p2_generator(); +// else if (affine) +// blst_p2_from_affine(m, p), p = m; +// blst_p2_mult(m, p, scalar, nbits); +// blst_p2_add_or_double(acc, acc, m); +// } +// +// static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine) +// { blst_p1 minus_b; +// if (affine) +// blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x); +// else +// minus_b = *(const blst_p1*)x; +// blst_p1_cneg(&minus_b, 1); +// blst_p1_add_or_double(a, a, &minus_b); +// } +// +// static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine) +// { blst_p2 minus_b; +// if (affine) +// blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x); +// else +// minus_b = *(const blst_p2*)x; +// blst_p2_cneg(&minus_b, 1); +// blst_p2_add_or_double(a, a, &minus_b); +// } +// +// static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in) +// { blst_scalar_from_bendian(ret, in); +// return blst_sk_check(ret); +// } +// static bool go_hash_to_scalar(blst_scalar *ret, +// const byte *msg, size_t msg_len, +// const byte *DST, size_t DST_len) +// { byte elem[48]; +// blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); +// return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); +// } +import "C" +import ( + "fmt" + "math/bits" + "runtime" + "sync" + "sync/atomic" +) + +const BLST_SCALAR_BYTES = 256 / 8 +const BLST_FP_BYTES = 384 / 8 +const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES +const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 + +type Scalar = C.blst_scalar +type Fp = C.blst_fp +type Fp2 = C.blst_fp2 +type Fp6 = C.blst_fp6 +type Fp12 = C.blst_fp12 +type P1 = C.blst_p1 +type P2 = C.blst_p2 +type P1Affine = C.blst_p1_affine +type P2Affine = C.blst_p2_affine +type Message = []byte +type Pairing = []C.blst_pairing +type SecretKey = Scalar +type P1s []P1 +type P2s []P2 +type P1Affines []P1Affine +type P2Affines []P2Affine + +// +// Configuration +// + +var maxProcs = initMaxProcs() + +func initMaxProcs() int { + maxProcs := runtime.GOMAXPROCS(0) + var version float32 + _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) + if err != nil || version < 1.14 { + // be cooperative and leave one processor for the application + maxProcs -= 1 + } + if maxProcs <= 0 { + maxProcs = 1 + } + return maxProcs +} + +func SetMaxProcs(max int) { + if max <= 0 { + max = 1 + } + maxProcs = max +} + +// Secret key +func (sk *SecretKey) Zeroize() { + var zero 
SecretKey + *sk = zero +} + +func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + if len(ikm) < 32 { + return nil + } + C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v3(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v4_5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func DeriveMasterEip2333(ikm []byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + C.blst_derive_master_eip2333(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { + var sk SecretKey + C.blst_derive_child_eip2333(&sk, master, C.uint(child_index)) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... 
+ runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +// Pairing +func PairingCtx(hash_or_encode bool, DST []byte) Pairing { + DST_len := C.size_t(len(DST)) + ctx := make([]C.blst_pairing, int(C.go_pairing_sizeof(DST_len))) + var uDST *C.byte + if DST_len > 0 { + uDST = (*C.byte)(&DST[0]) + } + C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), uDST, DST_len) + return ctx +} + +func PairingCommit(ctx Pairing) { + C.blst_pairing_commit(&ctx[0]) +} + +func PairingMerge(ctx Pairing, ctx1 Pairing) int { + r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) + return int(r) +} + +func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { + var gtsig *Fp12 = nil + if len(optional) > 0 { + gtsig = optional[0] + } + return bool(C.blst_pairing_finalverify(&ctx[0], gtsig)) +} + +func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { + C.blst_pairing_raw_aggregate(&ctx[0], q, p) +} + +func PairingAsFp12(ctx Pairing) *Fp12 { + var pt Fp12 + C.go_pairing_as_fp12(&pt, &ctx[0]) + return &pt +} + +func Fp12One() Fp12 { + return *C.blst_fp12_one() +} + +func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { + return bool(C.blst_fp12_finalverify(pt1, pt2)) +} + +func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { + var pt Fp12 + C.blst_miller_loop(&pt, q, p) + return &pt +} + +func (pt *Fp12) MulAssign(p *Fp12) { + C.blst_fp12_mul(pt, pt, p) +} + +func (pt *Fp12) FinalExp() { + C.blst_final_exp(pt, pt) +} + +func (pt *Fp12) InGroup() bool { + return bool(C.blst_fp12_in_group(pt)) +} + +func (pt *Fp12) ToBendian() []byte { + var out [BLST_FP_BYTES * 12]byte + C.blst_bendian_from_fp12((*C.byte)(&out[0]), pt) + return out[:] +} + +// +// MIN-PK +// + +// +// PublicKey +// + +func (pk *P1Affine) From(s *Scalar) *P1Affine { + C.blst_sk_to_pk2_in_g1(nil, pk, s) + return pk +} + +func (pk *P1Affine) KeyValidate() bool { + return !bool(C.blst_p1_affine_is_inf(pk)) && + bool(C.blst_p1_affine_in_g1(pk)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. +func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { + return (sigInfcheck && !bool(C.blst_p2_affine_is_inf(sig))) || + bool(C.blst_p2_affine_in_g2(sig)) +} + +// +// Sign +// + +func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P2Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P2 + if useHash { + q = HashToG2(msg, dst, augSingle) + } else { + q = EncodeToG2(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g1(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP2 func() *P2Affine +type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) + +// Single verify with decompressed pk +func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) 
+ if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P2Affine) AggregateVerify(sigGroupcheck bool, + pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P2Affine { + return sig + } + + pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P2Affine { + sigP := new(P2Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, + pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. 
+ msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. + mutex.Lock() + mutex.Unlock() + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG1(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g2(>sig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P2Affine { + return sig + } + pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + var udst *C.byte + if len(dst) > 0 { + udst = (*C.byte)(&dst[0]) + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + return int(C.blst_core_verify_pk_in_g1(pk, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + udst, C.size_t(len(dst)), + uaug, C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P1Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P1Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) 
+} + +func (dummy *P2Affine) MultipleAggregateVerify(sigs []*P2Affine, + sigsGroupcheck bool, pks []*P1Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, sig *P2Affine, pk *P1Affine, rand *Scalar) ( + *P2Affine, *P1Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, + rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P1Affine + var tempSig P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P2 +// + +type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine +type P2Aggregate struct { + v *P2 +} + +// Aggregate uncompressed elements +func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +// Aggregate compressed elements +func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true 
+ } + getter := func(i uint32, p *P2Affine) *P2Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p2_add_or_double(agg.v, agg.v, other.v) + } +} + +func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p2_affine_in_g2(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P2) + C.blst_p2_from_affine(agg.v, elmt) + } else { + C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P2Aggregate) ToAffine() *P2Affine { + if agg.v == nil { + return new(P2Affine) + } + return agg.v.ToAffine() +} + +func (agg *P2Aggregate) aggregate(getter aggGetterP2, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P2 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P2 + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p2_affine_in_g2(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p2_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p2_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} + +// +// MIN-SIG +// + +// +// PublicKey +// + +func (pk *P2Affine) From(s *Scalar) *P2Affine { + C.blst_sk_to_pk2_in_g2(nil, pk, s) + return pk +} + +func (pk *P2Affine) KeyValidate() bool { + return !bool(C.blst_p2_affine_is_inf(pk)) && + bool(C.blst_p2_affine_in_g2(pk)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. +func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { + return (sigInfcheck && !bool(C.blst_p1_affine_is_inf(sig))) || + bool(C.blst_p1_affine_in_g1(sig)) +} + +// +// Sign +// + +func (sig *P1Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P1Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) 
+ if !ok || len(aug) != 0 { + return nil + } + + var q *P1 + if useHash { + q = HashToG1(msg, dst, augSingle) + } else { + q = EncodeToG1(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g2(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP1 func() *P1Affine +type pkGetterP2 func(i uint32, temp *P2Affine) (*P2Affine, []byte) + +// Single verify with decompressed pk +func (sig *P1Affine) Verify(sigGroupcheck bool, pk *P2Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P2Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P1Affine) AggregateVerify(sigGroupcheck bool, + pks []*P2Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P1Affine { + return sig + } + + pkFn := func(i uint32, _ *P2Affine) (*P2Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P1Affine { + sigP := new(P1Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P2Affine) (*P2Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P2_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P2_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, + pkFn pkGetterP2, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. 
+ mutex.Lock() + mutex.Unlock() + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG2(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g1(&gtsig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, &gtsig) +} + +func CoreVerifyPkInG2(pk *P2Affine, sig *P1Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P1Affine { + return sig + } + pkFn := func(_ uint32, _ *P2Affine) (*P2Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG2(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + var udst *C.byte + if len(dst) > 0 { + udst = (*C.byte)(&dst[0]) + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + return int(C.blst_core_verify_pk_in_g2(pk, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + udst, C.size_t(len(dst)), + uaug, C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P1Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P2Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P2Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) +} + +func (dummy *P1Affine) MultipleAggregateVerify(sigs []*P1Affine, + sigsGroupcheck bool, pks []*P2Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...)
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, sig *P1Affine, pk *P2Affine, rand *Scalar) ( + *P1Affine, *P2Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG2(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG2 func(work uint32, sig *P1Affine, pk *P2Affine, + rand *Scalar) (*P1Affine, *P2Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG2(paramsFn mulAggGetterPkInG2, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P2Affine + var tempSig P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG2(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P1 +// + +type aggGetterP1 func(i uint32, temp *P1Affine) *P1Affine +type P1Aggregate struct { + v *P1 +} + +// Aggregate uncompressed elements +func (agg *P1Aggregate) Aggregate(elmts []*P1Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P1Affine) *P1Affine { return elmts[i] } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +// Aggregate compressed elements +func (agg *P1Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, p *P1Affine) *P1Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P1Aggregate) AddAggregate(other *P1Aggregate) { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p1_add_or_double(agg.v, agg.v, other.v) + } +} + 
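+// Illustrative use of the aggregation and verification entry points defined
+// in this file (a sketch only; sigs, pks, msg, msgs and dst are
+// caller-supplied placeholders, not identifiers defined here):
+//
+//	agg := new(P1Aggregate)
+//	if agg.Aggregate(sigs, true) {          // group-checked aggregation
+//		aggSig := agg.ToAffine()
+//		_ = aggSig.AggregateVerify(true, pks, true, msgs, dst) // distinct messages
+//		_ = aggSig.FastAggregateVerify(true, pks, msg, dst)    // PoP keys, common message
+//	}
+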
+func (agg *P1Aggregate) Add(elmt *P1Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p1_affine_in_g1(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P1) + C.blst_p1_from_affine(agg.v, elmt) + } else { + C.blst_p1_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P1Aggregate) ToAffine() *P1Affine { + if agg.v == nil { + return new(P1Affine) + } + return agg.v.ToAffine() +} + +func (agg *P1Aggregate) aggregate(getter aggGetterP1, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P1 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P1 + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p1_affine_in_g1(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p1_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p1_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p1_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} +func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. 
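+// Points use the Zcash encoding: 96 uncompressed / 48 compressed bytes for
+// G1 (twice that for G2), with the top bit of the first byte flagging a
+// compressed encoding, as checked in AggregateVerifyCompressed above.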
+// + +// P1 Serdes +func (p1 *P1Affine) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { + if len(in) != BLST_P1_SERIALIZE_BYTES { + return nil + } + if C.blst_p1_deserialize(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} +func (p1 *P1Affine) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { + if len(in) != BLST_P1_COMPRESS_BYTES { + return nil + } + if C.blst_p1_uncompress(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} + +func (p1 *P1Affine) InG1() bool { + return bool(C.blst_p1_affine_in_g1(p1)) +} + +func (dummy *P1Affine) BatchUncompress(in [][]byte) []*P1Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P1Affine, n) + pointsPtrs := make([]*P1Affine, n) + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. + resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p1 *P1) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_serialize((*C.byte)(&out[0]), p1) + return out[:] +} +func (p1 *P1) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p1_mult(p1, p1, scalar, C.size_t(nbits)) + return p1 +} + +func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { + ret := *p1 + return ret.MultAssign(scalarIf, optional...) 
+} + +func (p1 *P1) AddAssign(pointIf interface{}) *P1 { + switch val := pointIf.(type) { + case *P1: + C.blst_p1_add_or_double(p1, p1, val) + case *P1Affine: + C.blst_p1_add_or_double_affine(p1, p1, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p1 +} + +func (p1 *P1) Add(pointIf interface{}) *P1 { + ret := *p1 + return ret.AddAssign(pointIf) +} + +func (p1 *P1) SubAssign(pointIf interface{}) *P1 { + var x *Fp + var affine C.bool + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p1_sub_assign(p1, x, affine) + return p1 +} + +func (p1 *P1) Sub(pointIf interface{}) *P1 { + ret := *p1 + return ret.SubAssign(pointIf) +} + +func P1Generator() *P1 { + return C.blst_p1_generator() +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P1 { + var x *Fp + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p1_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P1) ToAffine() *P1Affine { + var pa P1Affine + C.blst_p1_to_affine(&pa, p) + return &pa +} + +func (p *P1) FromAffine(pa *P1Affine) { + C.blst_p1_from_affine(p, pa) +} + +// Hash +func HashToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g1(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +func EncodeToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g1(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P1sToAffine(points []*P1, optional ...int) P1Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P1Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P1s) ToAffine(optional ...P1Affines) P1Affines { + npoints := len(points) + var ret P1Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints 
mismatch") + } + } else { + ret = make([]P1Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p1slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(out *P1Affine, inp *P1, delta int) { + C.go_p1slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P1Affines) Add() *P1 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P1 + C.go_p1slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P1, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P1Affine, delta int) { + var ret P1 + C.go_p1slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p1_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P1s) Add() *P1 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { + var npoints int + switch val := pointsIf.(type) { + case []*P1Affine: + npoints = len(val) + case []P1Affine: + npoints = len(val) + case P1Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := maxProcs + numCores := runtime.GOMAXPROCS(0) + if numCores < maxProcs { + numThreads = numCores + } + + if numThreads < 2 || npoints < 32 { + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P1Affine{nil, nil} + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[0] + case []P1Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = 
(*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[0].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P1 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<<uint(window-1)) + pointsBySlice := [2]*P1Affine{nil, nil} + scalarsBySlice := [2]*C.byte{nil, nil} + _cgoCheckPointer := func(...interface{}) {} + + for { + workItem := atomic.AddInt32(&curItem, 1) - 1 + if int(workItem) >= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[x] + case []P1Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[x].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p1s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P1 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + C.blst_p1_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p1_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P1Affines) Mult(scalarsIf interface{}, nbits int) *P1 { + return P1AffinesMult(points, scalarsIf, nbits) +} + +func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 { + return 
points.ToAffine().Mult(scalarsIf, nbits) +} +func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. +// + +// P2 Serdes +func (p2 *P2Affine) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_affine_serialize((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Deserialize(in []byte) *P2Affine { + if len(in) != BLST_P2_SERIALIZE_BYTES { + return nil + } + if C.blst_p2_deserialize(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} +func (p2 *P2Affine) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_affine_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Uncompress(in []byte) *P2Affine { + if len(in) != BLST_P2_COMPRESS_BYTES { + return nil + } + if C.blst_p2_uncompress(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} + +func (p2 *P2Affine) InG2() bool { + return bool(C.blst_p2_affine_in_g2(p2)) +} + +func (dummy *P2Affine) BatchUncompress(in [][]byte) []*P2Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P2Affine, n) + pointsPtrs := make([]*P2Affine, n) + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. 
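+ // As with the P1Affine variant above, a single failed Uncompress marks the
+ // whole batch invalid and nil is returned to the caller.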
+ resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p2 *P2) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_serialize((*C.byte)(&out[0]), p2) + return out[:] +} +func (p2 *P2) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2) MultAssign(scalarIf interface{}, optional ...int) *P2 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p2_mult(p2, p2, scalar, C.size_t(nbits)) + return p2 +} + +func (p2 *P2) Mult(scalarIf interface{}, optional ...int) *P2 { + ret := *p2 + return ret.MultAssign(scalarIf, optional...) +} + +func (p2 *P2) AddAssign(pointIf interface{}) *P2 { + switch val := pointIf.(type) { + case *P2: + C.blst_p2_add_or_double(p2, p2, val) + case *P2Affine: + C.blst_p2_add_or_double_affine(p2, p2, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p2 +} + +func (p2 *P2) Add(pointIf interface{}) *P2 { + ret := *p2 + return ret.AddAssign(pointIf) +} + +func (p2 *P2) SubAssign(pointIf interface{}) *P2 { + var x *Fp2 + var affine C.bool + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p2_sub_assign(p2, x, affine) + return p2 +} + +func (p2 *P2) Sub(pointIf interface{}) *P2 { + ret := *p2 + return ret.SubAssign(pointIf) +} + +func P2Generator() *P2 { + return C.blst_p2_generator() +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P2) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P2 { + var x *Fp2 + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p2_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P2) ToAffine() *P2Affine { + var pa P2Affine + C.blst_p2_to_affine(&pa, p) + return &pa +} + +func (p *P2) FromAffine(pa *P2Affine) { + C.blst_p2_from_affine(p, pa) +} + +// Hash +func HashToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + 
// Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g2(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +func EncodeToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g2(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P2sToAffine(points []*P2, optional ...int) P2Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P2Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P2s) ToAffine(optional ...P2Affines) P2Affines { + npoints := len(points) + var ret P2Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints mismatch") + } + } else { + ret = make([]P2Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p2slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(out *P2Affine, inp *P2, delta int) { + C.go_p2slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P2AffinesAdd(points []*P2Affine, optional ...int) *P2 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P2Affines) Add() *P2 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P2 + C.go_p2slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P2, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P2Affine, delta int) { + var ret P2 + C.go_p2slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p2_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P2s) Add() *P2 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P2AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P2 { + var npoints int + switch val := pointsIf.(type) { + case []*P2Affine: + npoints = len(val) + case []P2Affine: + 
npoints = len(val) + case P2Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := maxProcs + numCores := runtime.GOMAXPROCS(0) + if numCores < maxProcs { + numThreads = numCores + } + + if numThreads < 2 || npoints < 32 { + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P2Affine{nil, nil} + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[0] + case []P2Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[0].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P2 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<<uint(window-1)) + pointsBySlice := [2]*P2Affine{nil, nil} + scalarsBySlice := [2]*C.byte{nil, nil} + _cgoCheckPointer := func(...interface{}) {} + + for { + workItem := atomic.AddInt32(&curItem, 1) - 1 + if int(workItem) >= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[x] + case []P2Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars 
= &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[x].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p2s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P2 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + C.blst_p2_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p2_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P2Affines) Mult(scalarsIf interface{}, nbits int) *P2 { + return P2AffinesMult(points, scalarsIf, nbits) +} + +func (points P2s) Mult(scalarsIf interface{}, nbits int) *P2 { + return points.ToAffine().Mult(scalarsIf, nbits) +} + +func parseOpts(optional ...interface{}) ([]byte, [][]byte, bool, bool) { + var aug [][]byte // For aggregate verify + var augSingle []byte // For signing + useHash := true // hash (true), encode (false) + + for _, arg := range optional { + switch v := arg.(type) { + case []byte: + augSingle = v + case [][]byte: + aug = v + case bool: + useHash = v + default: + return nil, nil, useHash, false + } + } + return augSingle, aug, useHash, true +} + +func bytesAllZero(s []byte) bool { + for _, v := range s { + if v != 0 { + return false + } + } + return true +} + +// These methods are inefficient because of cgo call overhead. For this +// reason they should be used primarily for prototyping with a goal to +// formulate interfaces that would process multiple scalars per cgo call. +func (a *Scalar) MulAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_mul_n_check(a, a, b)) +} + +func (a *Scalar) Mul(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_mul_n_check(&ret, a, b)) +} + +func (a *Scalar) AddAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_add_n_check(a, a, b)) +} + +func (a *Scalar) Add(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_add_n_check(&ret, a, b)) +} + +func (a *Scalar) SubAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_sub_n_check(a, a, b)) +} + +func (a *Scalar) Sub(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_sub_n_check(&ret, a, b)) +} + +func (a *Scalar) Inverse() *Scalar { + var ret Scalar + C.blst_sk_inverse(&ret, a) + return &ret +} + +// +// Serialization/Deserialization. 
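+// Scalars serialize as 32 big-endian bytes (BLST_SCALAR_BYTES); Fp elements
+// as 48 bytes (BLST_FP_BYTES).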
+// + +// Scalar serdes +func (s *Scalar) Serialize() []byte { + var out [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&out[0]), s) + return out[:] +} + +func (s *Scalar) Deserialize(in []byte) *Scalar { + if len(in) != BLST_SCALAR_BYTES || + !C.go_scalar_from_bendian(s, (*C.byte)(&in[0])) { + return nil + } + return s +} + +func (s *Scalar) Valid() bool { + return bool(C.blst_sk_check(s)) +} + +func (s *Scalar) HashTo(msg []byte, dst []byte) bool { + ret := HashToScalar(msg, dst) + if ret != nil { + *s = *ret + return true + } + return false +} + +func HashToScalar(msg []byte, dst []byte) *Scalar { + var ret Scalar + + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + if C.go_hash_to_scalar(&ret, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst))) { + return &ret + } + + return nil +} + +// +// LEndian +// + +func (fr *Scalar) ToLEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_lendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToLEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_lendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromLEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_le_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromLEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_lendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// BEndian +// + +func (fr *Scalar) ToBEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToBEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_bendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromBEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_be_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromBEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_bendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// Printing +// + +func PrintBytes(val []byte, name string) { + fmt.Printf("%s = %02x\n", name, val) +} + +func (s *Scalar) Print(name string) { + arr := s.ToBEndian() + PrintBytes(arr[:], name) +} + +func (p *P1Affine) Print(name string) { + fmt.Printf("%s:\n", name) + arr := p.x.ToBEndian() + PrintBytes(arr, " x") + arr = p.y.ToBEndian() + PrintBytes(arr, " y") +} + +func (p *P1) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +func (f *Fp2) Print(name string) { + fmt.Printf("%s:\n", name) + arr := f.fp[0].ToBEndian() + PrintBytes(arr, " 0") + arr = f.fp[1].ToBEndian() + PrintBytes(arr, " 1") +} + +func (p *P2Affine) Print(name string) { + fmt.Printf("%s:\n", name) + p.x.Print(" x") + p.y.Print(" y") +} + +func (p *P2) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +// +// Equality +// + +func (s1 *Scalar) Equals(s2 *Scalar) bool { + return *s1 == *s2 +} + +func (e1 *Fp) Equals(e2 *Fp) bool { + return *e1 == *e2 +} + +func (e1 *Fp2) Equals(e2 *Fp2) bool { + return *e1 == *e2 +} + +func (e1 *P1Affine) Equals(e2 *P1Affine) bool { + return bool(C.blst_p1_affine_is_equal(e1, e2)) +} + +func (e1 *P1) Equals(e2 *P1) bool { + return 
bool(C.blst_p1_is_equal(e1, e2)) +} + +func (e1 *P2Affine) Equals(e2 *P2Affine) bool { + return bool(C.blst_p2_affine_is_equal(e1, e2)) +} + +func (e1 *P2) Equals(e2 *P2) bool { + return bool(C.blst_p2_is_equal(e1, e2)) +} + +// private thunk for testing + +func expandMessageXmd(msg []byte, dst []byte, len_in_bytes int) []byte { + ret := make([]byte, len_in_bytes) + + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + C.blst_expand_message_xmd((*C.byte)(&ret[0]), C.size_t(len(ret)), + msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst))) + return ret +} + +func breakdown(nbits, window, ncpus int) (int, int, int) { + var nx, ny, wnd int + + if nbits > window*ncpus { + nx = 1 + wnd = bits.Len(uint(ncpus) / 4) + if (window + wnd) > 18 { + wnd = window - wnd + } else { + wnd = (nbits/window + ncpus - 1) / ncpus + if (nbits/(window+1)+ncpus-1)/ncpus < wnd { + wnd = window + 1 + } else { + wnd = window + } + } + } else { + nx = 2 + wnd = window - 2 + for (nbits/wnd+1)*nx < ncpus { + nx += 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + nx -= 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + ny = nbits/wnd + 1 + wnd = nbits/ny + 1 + + return nx, ny, wnd +} + +func pippenger_window_size(npoints int) int { + wbits := bits.Len(uint(npoints)) + + if wbits > 13 { + return wbits - 4 + } + if wbits > 5 { + return wbits - 3 + } + return 2 +} diff --git a/crypto/internal/blst/blst.h b/crypto/internal/blst/blst.h new file mode 100644 index 00000000000..2e314b3a32e --- /dev/null +++ b/crypto/internal/blst/blst.h @@ -0,0 +1,483 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include <stddef.h> +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include <stdint.h> +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void 
blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); +void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specific Fr operations. + */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specific Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specific Fp2 operations. 
+ */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specific Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(void); +#endif // SWIG + +/* + * BLS12-381-specific point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(void); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(void); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const 
blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(void); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(void); + +/* + * Multi-scalar multiplications and other multi-point operations. + */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. 
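+ * blst_hash_to_g1/g2 implement the uniform ("hash_to_curve") construction of
+ * the hash-to-curve specification, while blst_encode_to_g1/g2 implement the
+ * cheaper non-uniform ("encode_to_curve") variant; a non-NULL |aug| is
+ * prepended to |msg| before hashing (message-augmentation scheme).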
+ */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. + */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. 
+ */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(void); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. 
+ */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/crypto/internal/blst/blst_aux.h b/crypto/internal/blst/blst_aux.h new file mode 100644 index 00000000000..d96b1f3dd3b --- /dev/null +++ b/crypto/internal/blst/blst_aux.h @@ -0,0 +1,111 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. + */ + +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_| can be NULL. 
+ */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +#ifdef __BLST_RUST_BINDGEN__ +typedef struct {} blst_uniq; +#else +typedef struct blst_opaque blst_uniq; +#endif + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); +void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); + +void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_derive_master_eip2333(blst_scalar *out_SK, + const byte *IKM, size_t IKM_len); +void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, + uint32_t child_index); + +void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); +void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); +void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); + +size_t blst_p1_sizeof(void); +size_t blst_p1_affine_sizeof(void); +size_t blst_p2_sizeof(void); +size_t blst_p2_affine_sizeof(void); +size_t blst_fp12_sizeof(void); + +/* + * Single-shot SHA-256 hash function. + */ +void blst_sha256(byte out[32], const byte *msg, size_t msg_len); +#endif \ No newline at end of file From 98bb9ccfc810cedb6456db0304fb9813381c6301 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 13:21:03 -0600 Subject: [PATCH 065/200] clarify internal blst package structure in comment --- crypto/internal/blst/blst.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go index 97b9047d1e3..d9a58470d12 100644 --- a/crypto/internal/blst/blst.go +++ b/crypto/internal/blst/blst.go @@ -1,5 +1,14 @@ /* - * Copied from https://github.com/supranational/blst + * This package is equivalent to the BLST Go package including all Go exported + * functions. BLST outer Go layer is used to cross-check flow-go/crypto BLS implementation. + * Note that flow-go/crypto uses BLST internal tools only to implement protocols based on BLS12-381, + * but does not use BLST outer layer and BLS implementation. 
+ * Ideally, the cross-check tests would import github.com/supranational/blst. However this is + * not possible in Go as it causes multiple duplicated C objects. Creating the internal blst + * package is a workaround to achieve the same purpose. Note that the internal package + * implicitly uses the C objects declared by flow-go/crypto. + * + * Copied from https://github.com/supranational/blst. * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 From d729688d5383f888d7288478a073e57adf8ba969 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 13:21:36 -0600 Subject: [PATCH 066/200] update bls cross-checks --- crypto/bls_crossBLST_test.go | 53 +++++++++++++++--------------------- crypto/go.mod | 1 + crypto/go.sum | 2 ++ 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e9f9a902d0b..f08cc52152c 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -16,24 +16,17 @@ package crypto // both libraries might have made different choices. It is nevertheless a good flag for possible bugs or deviations // from the standard as both libraries are being developed. -/*import ( +import ( "testing" + "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - blst "github.com/supranational/blst/bindings/go" "pgregory.net/rapid" - - "github.com/onflow/flow-go/crypto" -)*/ - -// TODO: this file can't compile because of duplicate C and assembly symbols (the ones used -// by the current library and the same ones used by the imported package BLST). Unfortunately, -// cgo doesn't differentiate the two symbols. These tests need to be rewritten using the internal -// BLST C exports, instead of importing the Go BLST package. +) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library -/*func validPrivateKeyBytesFlow(t *rapid.T) []byte { +func validPrivateKeyBytesFlow(t *rapid.T) []byte { seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid @@ -56,7 +49,7 @@ func validSignatureBytesFlow(t *rapid.T) []byte { seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) - hasher := crypto.NewExpandMsgXOFKMAC128("random_tag") + hasher := NewExpandMsgXOFKMAC128("random_tag") message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) signature, err := sk.Sign(message, hasher) require.NoError(t, err) @@ -89,14 +82,14 @@ func validSignatureBytesBLST(t *rapid.T) []byte { // testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. // This test assumes private key serialization is identical to the one in BLST. 
func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PrKeyLenBLSBLS12381, crypto.PrKeyLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), prKeyLengthBLSBLS12381, prKeyLengthBLSBLS12381) validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) + skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -116,14 +109,14 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { // testEncodeDecodePublicKeyCrossBLST tests encoding and decoding of public keys keys are consistent with BLST. // This test assumes public key serialization is identical to the one in BLST. func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.PubKeyLenBLSBLS12381, crypto.PubKeyLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), PubKeyLenBLSBLS12381, PubKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent - pkFlow, err := crypto.DecodePublicKey(crypto.BLSBLS12381, pkBytes) + pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) var pkBLST blst.P2Affine res := pkBLST.Deserialize(pkBytes) pkValidBLST := pkBLST.KeyValidate() @@ -136,34 +129,32 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { if flowPass && blstPass { pkFlowOutBytes := pkFlow.Encode() pkBLSTOutBytes := pkBLST.Compress() - assert.Equal(t, pkFlowOutBytes, pkBLSTOutBytes) } } -// testEncodeDecodeSignatureCrossBLST tests encoding and decoding of signatures are consistent with BLST. -// This test assumes signature serialization is identical to the one in BLST. -func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), crypto.SignatureLenBLSBLS12381, crypto.SignatureLenBLSBLS12381) +// testEncodeDecodeG1CrossBLST tests encoding and decoding of G1 points are consistent with BLST. +// This test assumes signature serialization is identical to BLST. 
+func testEncodeDecodeG1CrossBLST(t *rapid.T) { + randomSlice := rapid.SliceOfN(rapid.Byte(), SignatureLenBLSBLS12381, SignatureLenBLSBLS12381) validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) - // sigBytes are bytes of either a valid or a random signature + // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) // check decoding results are consistent var pointFlow pointE1 - // here we test readPointE1 rather than the simple Signature type alias err := readPointE1(&pointFlow, sigBytes) - flowPass := (err == nil) && (checkMembershipG1(&pointFlow) == int(valid)) + flowPass := (err == nil) && (checkMembershipG1(&pointFlow)) var pointBLST blst.P1Affine + // res is non-nil iff point is in G1 res := pointBLST.Uncompress(sigBytes) - // flow validation has no infinity rejection for G1 blstPass := (res != nil) && pointBLST.SigValidate(false) - require.Equal(t, flowPass, blstPass, "deserialization of signature %x differs", sigBytes) + require.Equal(t, flowPass, blstPass, "deserialization of G1 %x differs", sigBytes) - // check both signatures (G1 points) are equal + // check both serializations of G1 points are equal if flowPass && blstPass { sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) writePointG1(sigFlowOutBytes, &pointFlow) @@ -187,7 +178,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) - skFlow, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, skBytes) + skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) require.NoError(t, err) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -221,10 +212,10 @@ func testKeyGenCrossBLST(t *rapid.T) { assert.Equal(t, skFlow.Encode(), skBLST.Serialize()) } -func TestAgainstBLST(t *testing.T) { +func TestCrossBLST(t *testing.T) { rapid.Check(t, testKeyGenCrossBLST) rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST) rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) - rapid.Check(t, testEncodeDecodeSignatureCrossBLST) + //rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented rapid.Check(t, testSignHashCrossBLST) -}*/ +} diff --git a/crypto/go.mod b/crypto/go.mod index 57c20ef9341..d10f7a17808 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -8,6 +8,7 @@ require ( github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 + pgregory.net/rapid v0.4.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 181f9b302c0..820bb87a41c 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -52,4 +52,6 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From ab2fa14494785f89875f1333312f83f68405c42f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 16:09:55 -0600 Subject: [PATCH 067/200] upgrade rapid package --- 
crypto/bls_crossBLST_test.go | 29 +++++++++++++++-------------- crypto/go.mod | 2 +- crypto/go.sum | 4 ++-- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index f08cc52152c..623409cd338 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -19,15 +19,16 @@ package crypto import ( "testing" - "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "pgregory.net/rapid" + + "github.com/onflow/flow-go/crypto/internal/blst" ) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library func validPrivateKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") sk, err := GeneratePrivateKey(BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid if err != nil { @@ -38,7 +39,7 @@ func validPrivateKeyBytesFlow(t *rapid.T) []byte { // validPublicKeyBytesFlow generates bytes of a valid public key in Flow library func validPublicKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) return sk.PublicKey().Encode() @@ -46,11 +47,11 @@ func validPublicKeyBytesFlow(t *rapid.T) []byte { // validSignatureBytesFlow generates bytes of a valid signature in Flow library func validSignatureBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) hasher := NewExpandMsgXOFKMAC128("random_tag") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg") signature, err := sk.Sign(message, hasher) require.NoError(t, err) return signature @@ -59,13 +60,13 @@ func validSignatureBytesFlow(t *rapid.T) []byte { // validPrivateKeyBytesBLST generates bytes of a valid private key in BLST library func validPrivateKeyBytesBLST(t *rapid.T) []byte { randomSlice := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen) - ikm := randomSlice.Draw(t, "ikm").([]byte) + ikm := randomSlice.Draw(t, "ikm") return blst.KeyGen(ikm).Serialize() } // validPublicKeyBytesBLST generates bytes of a valid public key in BLST library func validPublicKeyBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") blstS := blst.KeyGen(ikm) blstG2 := new(blst.P2Affine).From(blstS) return blstG2.Compress() @@ -73,7 +74,7 @@ func validPublicKeyBytesBLST(t *rapid.T) []byte { // validSignatureBytesBLST generates bytes of a valid signature in BLST library func validSignatureBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") blstS := blst.KeyGen(ikm[:]) blstG1 := new(blst.P1Affine).From(blstS) return 
blstG1.Compress() @@ -86,7 +87,7 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key - skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) + skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() // check decoding results are consistent skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) @@ -113,7 +114,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key - pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) + pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() // check decoding results are consistent pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) @@ -140,7 +141,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes - sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) + sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example() // check decoding results are consistent var pointFlow pointE1 @@ -176,7 +177,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // The test also assumes Flow signature serialization is identical to the one in BLST. func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed - skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) + skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example() skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) require.NoError(t, err) @@ -186,7 +187,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two signatures using both libraries blsCipher := []byte("BLS_SIG_BLS12381G1_XMD:SHA-256_SSWU_RO_NUL_") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example().([]byte) + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example() var sigBLST blst.P1Affine sigBLST.Sign(&skBLST, message, blsCipher) @@ -202,7 +203,7 @@ func testSignHashCrossBLST(t *rapid.T) { } func testKeyGenCrossBLST(t *rapid.T) { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") skFlow, err := GeneratePrivateKey(BLSBLS12381, seed) if err != nil { diff --git a/crypto/go.mod b/crypto/go.mod index d10f7a17808..bb3a1561b90 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -8,7 +8,7 @@ require ( github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 - pgregory.net/rapid v0.4.7 + pgregory.net/rapid v0.5.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 820bb87a41c..9126d59b7b2 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -52,6 +52,6 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= -pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From 9d35d3d5f4325da523dc7e09ed4333bad8bf9477 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:16:49 -0600 Subject: [PATCH 068/200] fix linter false positives --- crypto/internal/blst/blst.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go index d9a58470d12..037e40d98a3 100644 --- a/crypto/internal/blst/blst.go +++ b/crypto/internal/blst/blst.go @@ -8,6 +8,8 @@ * package is a workaround to achieve the same purpose. Note that the internal package * implicitly uses the C objects declared by flow-go/crypto. * + * Note: linter staticcheck was added in two spots to avoid linter false positives. + * * Copied from https://github.com/supranational/blst. * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. @@ -587,6 +589,7 @@ func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() + //nolint:staticcheck mutex.Unlock() } @@ -1205,6 +1208,7 @@ func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() + //nolint:staticcheck mutex.Unlock() } From 6031d71edf7b0e0071dddef0bf23e57c51d48b08 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:17:05 -0600 Subject: [PATCH 069/200] go mod tidy --- go.mod | 2 +- go.sum | 4 ++-- insecure/go.sum | 2 +- integration/go.sum | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 64ea4ffb5ae..d8bade39895 100644 --- a/go.mod +++ b/go.mod @@ -95,7 +95,7 @@ require ( google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.2.0 google.golang.org/protobuf v1.30.0 gotest.tools v2.2.0+incompatible - pgregory.net/rapid v0.4.7 + pgregory.net/rapid v0.5.7 ) require ( diff --git a/go.sum b/go.sum index 630652a0d59..b7e715a259f 100644 --- a/go.sum +++ b/go.sum @@ -2255,8 +2255,8 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= -pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/insecure/go.sum b/insecure/go.sum index 346f2c72189..5f842bc20b9 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -2100,7 +2100,7 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= 
lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/integration/go.sum b/integration/go.sum index a31e392c4c7..64e4d983caf 100644 --- a/integration/go.sum +++ b/integration/go.sum @@ -2338,7 +2338,7 @@ modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= modernc.org/memory v1.5.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU= modernc.org/sqlite v1.21.1 h1:GyDFqNnESLOhwwDRaHGdp2jKLDzpyT/rNLglX3ZkMSU= modernc.org/sqlite v1.21.1/go.mod h1:XwQ0wZPIh1iKb5mkvCJ3szzbhk+tykC8ZWqTRTgYRwI= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= From 6b5fd8fffe0d6e8ed8a4b5ca3e305e33a438e716 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:49:33 -0600 Subject: [PATCH 070/200] Revert "upgrade rapid package" This reverts commit ab2fa14494785f89875f1333312f83f68405c42f. --- crypto/bls_crossBLST_test.go | 29 ++++++++++++++--------------- crypto/go.mod | 2 +- crypto/go.sum | 4 ++-- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 623409cd338..f08cc52152c 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -19,16 +19,15 @@ package crypto import ( "testing" + "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "pgregory.net/rapid" - - "github.com/onflow/flow-go/crypto/internal/blst" ) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library func validPrivateKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) // TODO: require.NoError(t, err) seems to mess with rapid if err != nil { @@ -39,7 +38,7 @@ func validPrivateKeyBytesFlow(t *rapid.T) []byte { // validPublicKeyBytesFlow generates bytes of a valid public key in Flow library func validPublicKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) return sk.PublicKey().Encode() @@ -47,11 +46,11 @@ func validPublicKeyBytesFlow(t *rapid.T) []byte { // validSignatureBytesFlow generates bytes of a valid signature in Flow library func validSignatureBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, 
KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) sk, err := GeneratePrivateKey(BLSBLS12381, seed) require.NoError(t, err) hasher := NewExpandMsgXOFKMAC128("random_tag") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg") + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) signature, err := sk.Sign(message, hasher) require.NoError(t, err) return signature @@ -60,13 +59,13 @@ func validSignatureBytesFlow(t *rapid.T) []byte { // validPrivateKeyBytesBLST generates bytes of a valid private key in BLST library func validPrivateKeyBytesBLST(t *rapid.T) []byte { randomSlice := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen) - ikm := randomSlice.Draw(t, "ikm") + ikm := randomSlice.Draw(t, "ikm").([]byte) return blst.KeyGen(ikm).Serialize() } // validPublicKeyBytesBLST generates bytes of a valid public key in BLST library func validPublicKeyBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) blstS := blst.KeyGen(ikm) blstG2 := new(blst.P2Affine).From(blstS) return blstG2.Compress() @@ -74,7 +73,7 @@ func validPublicKeyBytesBLST(t *rapid.T) []byte { // validSignatureBytesBLST generates bytes of a valid signature in BLST library func validSignatureBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm") + ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) blstS := blst.KeyGen(ikm[:]) blstG1 := new(blst.P1Affine).From(blstS) return blstG1.Compress() @@ -87,7 +86,7 @@ func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key - skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() + skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) @@ -114,7 +113,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { validSliceFlow := rapid.Custom(validPublicKeyBytesFlow) validSliceBLST := rapid.Custom(validPublicKeyBytesBLST) // pkBytes are bytes of either a valid or a random public key - pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example() + pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte) // check decoding results are consistent pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes) @@ -141,7 +140,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes - sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example() + sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) // check decoding results are consistent var pointFlow pointE1 @@ -177,7 +176,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // The test also assumes Flow signature serialization is identical to the one in BLST. 
func testSignHashCrossBLST(t *rapid.T) { // generate two private keys from the same seed - skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example() + skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) require.NoError(t, err) @@ -187,7 +186,7 @@ func testSignHashCrossBLST(t *rapid.T) { // generate two signatures using both libraries blsCipher := []byte("BLS_SIG_BLS12381G1_XMD:SHA-256_SSWU_RO_NUL_") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example() + message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example().([]byte) var sigBLST blst.P1Affine sigBLST.Sign(&skBLST, message, blsCipher) @@ -203,7 +202,7 @@ func testSignHashCrossBLST(t *rapid.T) { } func testKeyGenCrossBLST(t *rapid.T) { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed") + seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) skFlow, err := GeneratePrivateKey(BLSBLS12381, seed) if err != nil { diff --git a/crypto/go.mod b/crypto/go.mod index bb3a1561b90..d10f7a17808 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -8,7 +8,7 @@ require ( github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 - pgregory.net/rapid v0.5.7 + pgregory.net/rapid v0.4.7 ) require ( diff --git a/crypto/go.sum b/crypto/go.sum index 9126d59b7b2..820bb87a41c 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -52,6 +52,6 @@ gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= -pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From e35860fedf113574fd043c45611225fb9dfd9ba0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:50:01 -0600 Subject: [PATCH 071/200] Revert "go mod tidy" This reverts commit 6031d71edf7b0e0071dddef0bf23e57c51d48b08. 
--- go.mod | 2 +- go.sum | 4 ++-- insecure/go.sum | 2 +- integration/go.sum | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index d8bade39895..64ea4ffb5ae 100644 --- a/go.mod +++ b/go.mod @@ -95,7 +95,7 @@ require ( google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.2.0 google.golang.org/protobuf v1.30.0 gotest.tools v2.2.0+incompatible - pgregory.net/rapid v0.5.7 + pgregory.net/rapid v0.4.7 ) require ( diff --git a/go.sum b/go.sum index b7e715a259f..630652a0d59 100644 --- a/go.sum +++ b/go.sum @@ -2255,8 +2255,8 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= -pgregory.net/rapid v0.5.7/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= +pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/insecure/go.sum b/insecure/go.sum index 5f842bc20b9..346f2c72189 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -2100,7 +2100,7 @@ lukechampine.com/blake3 v1.1.7 h1:GgRMhmdsuK8+ii6UZFDL8Nb+VyMwadAgcJyfYHxG6n0= lukechampine.com/blake3 v1.1.7/go.mod h1:tkKEOtDkNtklkXtLNEOGNq5tcV90tJiA1vAA12R78LA= nhooyr.io/websocket v1.8.6 h1:s+C3xAMLwGmlI31Nyn/eAehUlZPwfYZu2JXM621Q5/k= nhooyr.io/websocket v1.8.6/go.mod h1:B70DZP8IakI65RVQ51MsWP/8jndNma26DVA/nFSCgW0= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/integration/go.sum b/integration/go.sum index 64e4d983caf..a31e392c4c7 100644 --- a/integration/go.sum +++ b/integration/go.sum @@ -2338,7 +2338,7 @@ modernc.org/memory v1.5.0 h1:N+/8c5rE6EqugZwHii4IFsaJ7MUhoWX07J5tC/iI5Ds= modernc.org/memory v1.5.0/go.mod h1:PkUhL0Mugw21sHPeskwZW4D6VscE/GQJOnIpCnW6pSU= modernc.org/sqlite v1.21.1 h1:GyDFqNnESLOhwwDRaHGdp2jKLDzpyT/rNLglX3ZkMSU= modernc.org/sqlite v1.21.1/go.mod h1:XwQ0wZPIh1iKb5mkvCJ3szzbhk+tykC8ZWqTRTgYRwI= -pgregory.net/rapid v0.5.7 h1:p7/XbOgyFY1I/3Q12UTXfos70VZTcgc3WeoyiEru5cs= +pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= From 22b1ef86061ca290ffc11abcc5522809cce2c665 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 18:51:44 -0600 Subject: [PATCH 072/200] fix linter error --- crypto/bls_crossBLST_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index f08cc52152c..e67c3c3bc33 100644 --- a/crypto/bls_crossBLST_test.go +++ 
b/crypto/bls_crossBLST_test.go @@ -19,10 +19,11 @@ package crypto import ( "testing" - "github.com/onflow/flow-go/crypto/internal/blst" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "pgregory.net/rapid" + + "github.com/onflow/flow-go/crypto/internal/blst" ) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library From 0290a985a92de6f147d5289546faa3b0a8ec6f04 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 8 May 2023 21:33:50 -0600 Subject: [PATCH 073/200] clean up of multiple pairing --- crypto/bls12381_utils.c | 9 --------- crypto/bls_core.c | 11 +---------- crypto/bls_include.h | 4 ---- 3 files changed, 1 insertion(+), 23 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index ccec6c78d17..cd1ebd543d1 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1073,7 +1073,6 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* ep2_copy(elemsG2[0], tmp); free(tmp); -#if DOUBLE_PAIRING // elemsG2[0] = -pk2 ep2_neg(elemsG2[0], elemsG2[0]); @@ -1085,14 +1084,6 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* // compare the result to 1 int res = fp12_cmp_dig(pair, 1); -#elif SINGLE_PAIRING - fp12_t pair1, pair2; - fp12_new(&pair1); fp12_new(&pair2); - pp_map_oatep_k12(pair1, elemsG1[0], elemsG2[0]); - pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); - - int res = fp12_cmp(pair1, pair2); -#endif fp12_free(&one); ep_free(elemsG1[0]); ep_free(elemsG1[1]); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 58a7287578f..815f1893375 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -84,7 +84,6 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int int ret = UNDEFINED; -#if DOUBLE_PAIRING // elemsG2[0] = -g2 ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded @@ -95,15 +94,7 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int // compare the result to 1 int res = fp12_cmp_dig(pair, 1); - -#elif SINGLE_PAIRING - fp12_t pair1, pair2; - fp12_new(&pair1); fp12_new(&pair2); - pp_map_oatep_k12(pair1, elemsG1[0], core_get()->ep2_g); - pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); - - int res = fp12_cmp(pair1, pair2); -#endif + if (core_get()->code == RLC_OK) { if (res == RLC_EQ) { ret = VALID; diff --git a/crypto/bls_include.h b/crypto/bls_include.h index d0f9120beb2..21a8d9fda59 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -16,10 +16,6 @@ #define SK_BITS (Fr_BITS) #define SK_LEN BITS_TO_BYTES(SK_BITS) -// Simultaneous Pairing in verification -#define DOUBLE_PAIRING 1 -#define SINGLE_PAIRING (DOUBLE_PAIRING^1) - // algorithm choice for hashing to G1 // both methods are similar implementations of the same optimized SSWU // but offer different timings. 
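
A note on the verification path kept by the cleanup above: rather than computing the two pairings e(sig, g2) and e(H(m), pk) separately and comparing them, the retained code checks the product e(sig, -g2) * e(H(m), pk) == 1, so both Miller loops share a single final exponentiation instead of needing one each. The next patch generalizes the same idea to an n-fold product (multi_pairing). The sketch below restates the two-pairing case using only the public blst declarations listed earlier in this series (blst_miller_loop, blst_fp12_mul, blst_final_exp, blst_fp12_is_one, BLS12_381_NEG_G2); the helper name verify_sig_sketch and its affine inputs are illustrative assumptions, not code from these patches, and group-membership checks and error handling are omitted.

/*
 * Minimal sketch, assuming signatures in G1 and public keys in G2
 * (the variant used by flow-go/crypto). Not part of the patches.
 */
#include <stdbool.h>
#include "blst.h"

static bool verify_sig_sketch(const blst_p1_affine *sig,  /* signature, a point of G1 */
                              const blst_p1_affine *hash, /* H(m) mapped to G1 */
                              const blst_p2_affine *pk)   /* public key, a point of G2 */
{
    blst_fp12 acc, tmp;

    blst_miller_loop(&acc, &BLS12_381_NEG_G2, sig); /* Miller loop of e(sig, -g2) */
    blst_miller_loop(&tmp, pk, hash);               /* Miller loop of e(H(m), pk) */
    blst_fp12_mul(&acc, &acc, &tmp);                /* multiply the loop outputs in Fp12 */
    blst_final_exp(&acc, &acc);                     /* one shared final exponentiation */

    return blst_fp12_is_one(&acc);                  /* 1 iff e(sig, g2) == e(H(m), pk) */
}
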
From 7f86c9492054a0411d39524f2fba6652438d4c8c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 01:06:28 -0600 Subject: [PATCH 074/200] Fp12 and multi-pairing computation --- crypto/bls12381_utils.c | 112 +++++++++++++++++++++++++++++++++++++++- crypto/bls12381_utils.h | 5 ++ crypto/blst_include.h | 4 ++ 3 files changed, 119 insertions(+), 2 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index cd1ebd543d1..8d724b28a7c 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -158,6 +158,9 @@ void Fr_set_limb(Fr* a, const limb_t l){ } void Fr_copy(Fr* res, const Fr* a) { + if ((uptr_t)a==(uptr_t)res) { + return; + } vec_copy((byte*)res, (byte*)a, sizeof(Fr)); } @@ -386,6 +389,9 @@ void Fp_set_limb(Fp* a, const limb_t l){ } void Fp_copy(Fp* res, const Fp* a) { + if ((uptr_t)a==(uptr_t)res) { + return; + } vec_copy((byte*)res, (byte*)a, sizeof(Fp)); } @@ -578,6 +584,31 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- G1 utilities +// res = p +void E1_copy(E1* res, const E1* p) { + if ((uptr_t)p==(uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E1)); +} + +// check if `p` is infinity +bool_t E1_is_infty(const E1* p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); +} + +// converts an E1 point from Jacobian into affine coordinates (z=1) +void E1_to_affine(E1* res, const E1* p) { + // optimization in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_pR, sizeof(p->z))) { + E1_copy(res, p); + return; + } + // convert from Jacobian + POINTonE1_from_Jacobian((POINTonE1*)res, (const POINTonE1*)p); +} + // ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. // len is the size of the input buffer. // @@ -967,13 +998,16 @@ bool_t E2_is_equal(const E2* p1, const E2* p2) { // res = p void E2_copy(E2* res, const E2* p) { + if ((uptr_t)p==(uptr_t)res) { + return; + } vec_copy(res, p, sizeof(E2)); } // converts an E2 point from Jacobian into affine coordinates (z=1) void E2_to_affine(E2* res, const E2* p) { - // minor optimization in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { + // optimization in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { E2_copy(res, p); return; } @@ -1296,6 +1330,80 @@ BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { return BLST_SUCCESS; } +// ------------------- Pairing utilities + +bool_t Fp12_is_one(Fp12 *a) { + return vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); +} + +static void Fp12_set_one(Fp12 *a) { + vec_copy(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])); + vec_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); +} + +// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) +// by optimizing a common final exponentiation for all pairings. +// result is stored in `res`. +// It assumes `p` and `q` are correctly initialized and all +// p[i] and q[i] are respectively on G1 and G2 (it does not +// check their memberships). +void multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { + // N_MAX is defined within BLST. It should represent a good tradeoff of the max number + // of miller loops to be batched in one call to `miller_loop_n`. 
+ E1 p_[N_MAX]; + E2 q_[N_MAX]; + int n = 0; // the number of couples (p,q) held p_ and q_ + int init_flag = 0; + + // easier access pointers + vec384fp6* res_vec = (vec384fp6*)res; + POINTonE1_affine* p_POINT = (POINTonE1_affine*)p_; + POINTonE2_affine* q_POINT = (POINTonE2_affine*)q_; + + + for (int i=0; i 0) { + if (!init_flag) { + miller_loop_n(res_vec, q_POINT, p_POINT, n); + init_flag = 1; + } else { + vec384fp12 tmp; + miller_loop_n(tmp, q_POINT, p_POINT, n); + mul_fp12(res_vec, res_vec, tmp); + } + } + + // check if no miller loop was computed + if (!init_flag) { + Fp12_set_one(res); + } + + final_exp(res_vec, res_vec); +} + + + // This is a testing function. // It wraps a call to a Relic macro since cgo can't call macros. void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 5244e8cd16a..6df825b3f57 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -131,6 +131,7 @@ void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); int E1_in_G1(const ep_t); +bool_t E1_is_infty(const E1*); int G1_simple_subgroup_check(const ep_t); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); @@ -157,6 +158,10 @@ bool_t E2_in_G2(const E2*); void map_bytes_to_G2(E2*, const uint8_t*, int); BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); +// pairing and Fp12 +bool_t Fp12_is_one(Fp12*); +void multi_pairing(Fp12*, const E1*, const E2*, const int); + // Utility functions ctx_t* relic_init_BLS12_381(); prec_st* init_precomputed_data_BLS12_381(); diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 64b8e4562b8..1f7b2484a3c 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -104,4 +104,8 @@ typedef vec384x Fp2; // `E2` is also used to represent all subgroup G_2 elements. typedef struct {Fp2 x,y,z;} E2; +// Fp12 is the codomain of the pairing function `e`, specifically the subgroup +// G_T of Fp12. 
+// Fp12 represents G_T elements and is equivalent to `vec384fp12` (used internally by BLST) +typedef vec384fp12 Fp12; #endif From 7b7e484b83e423df6879b75252a0e0892152b7c4 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 18:32:49 -0600 Subject: [PATCH 075/200] use E1 blst type in Go --- crypto/bls.go | 2 +- crypto/bls12381_utils.go | 37 ++++++++++++++----------------------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 56225332562..804c34b619c 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -550,7 +550,7 @@ func (a *blsBLS12381Algo) init() error { func mapToG1(data []byte) *pointE1 { l := len(data) var h pointE1 - C.map_to_G1((*C.ep_st)(&h), (*C.uchar)(&data[0]), (C.int)(l)) + C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) return &h } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0f685494d4f..df1df3e4f2b 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -41,17 +41,15 @@ import ( // Go wrappers around BLST C types // Go wrappers around Relic C types -type pointE1 C.ep_st +type pointE1 C.E1 type pointE2 C.E2 type scalar C.Fr -// BLS12-381 related lengths -var frBytesLen = int(C.get_Fr_BYTES()) - // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -//type pointG1_blst C.E1 -//type pointG2_blst C.E2 + +// BLS12-381 related lengths +var frBytesLen = int(C.get_Fr_BYTES()) // context required for the BLS set-up type ctx struct { @@ -94,24 +92,17 @@ func (ct *ctx) initContext() error { return nil } -// Exponentiation in G1 (scalar point multiplication) +// Scalar multiplication of a generic point `p` in G1 func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { - C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.Fr)(expo)) + C.E1_mult((*C.E1)(res), (*C.E1)(p), (*C.Fr)(expo)) } -// This function is for TEST only -// Exponentiation of g1 in G1 +// Scalar multiplication of generator g1 in G1 func generatorScalarMultG1(res *pointE1, expo *scalar) { - C.ep_mult_gen_bench((*C.ep_st)(res), (*C.Fr)(expo)) -} - -// This function is for TEST only -// Generic Exponentiation G1 -func genericScalarMultG1(res *pointE1, expo *scalar) { - C.ep_mult_generic_bench((*C.ep_st)(res), (*C.Fr)(expo)) + C.G1_mult_gen((*C.E1)(res), (*C.Fr)(expo)) } -// Exponentiation of g2 in G2 +// Scalar multiplication of generator g2 in G2 func generatorScalarMultG2(res *pointE2, expo *scalar) { C.G2_mult_gen((*C.E2)(res), (*C.Fr)(expo)) } @@ -187,7 +178,7 @@ func writePointG2(dest []byte, a *pointE2) { // follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointG1(dest []byte, a *pointE1) { C.ep_write_bin_compact((*C.uchar)(&dest[0]), - (*C.ep_st)(a), + (*C.E1)(a), (C.int)(signatureLengthBLSBLS12381), ) } @@ -240,7 +231,7 @@ func readPointE2(a *pointE2, src []byte) error { // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G1 membership check is performed. func readPointE1(a *pointE1, src []byte) error { - switch C.ep_read_bin_compact((*C.ep_st)(a), + switch C.ep_read_bin_compact((*C.E1)(a), (*C.uchar)(&src[0]), (C.int)(len(src))) { case valid: @@ -269,13 +260,13 @@ func checkMembershipG2(pt *pointE2) bool { // randPointG1 wraps a call to C since cgo can't be used in go test files. // It generates a random point in G1 and stores it in input point. 
func randPointG1(pt *pointE1) { - C.ep_rand_G1((*C.ep_st)(pt)) + C.ep_rand_G1((*C.E1)(pt)) } // randPointG1Complement wraps a call to C since cgo can't be used in go test files. // It generates a random point in E1\G1 and stores it in input point. func randPointG1Complement(pt *pointE1) { - C.ep_rand_G1complement((*C.ep_st)(pt)) + C.ep_rand_G1complement((*C.E1)(pt)) } */ @@ -311,7 +302,7 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) // serialize the point pointBytes := make([]byte, signatureLengthBLSBLS12381) From 7cafb2b0723462ba9034e2e9324e04e0da8aee6d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 18:33:36 -0600 Subject: [PATCH 076/200] use POINTonEx_mult_glv instead of blst_sign and remove Relic's E1 mult --- crypto/bls12381_utils.c | 39 ++++++++++++++++------------------- crypto/bls12381_utils.h | 10 ++++----- crypto/bls12381_utils_test.go | 22 +++++++------------- 3 files changed, 31 insertions(+), 40 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index ccec6c78d17..f188cb0d33e 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -701,26 +701,20 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { bin[0] |= (G1_SERIALIZATION << 7); } -// Exponentiation of a generic point p in G1 -void ep_mult(ep_t res, const ep_t p, const Fr *expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using window NAF of size 2 - ep_mul_lwnaf(res, p, tmp_expo); - free(tmp_expo); -} - -// Exponentiation of generator g1 in G1 -// These two function are here for bench purposes only -void ep_mult_gen_bench(ep_t res, const Fr* expo) { - bn_st* tmp_expo = Fr_blst_to_relic(expo); - // Using precomputed table of size 4 - ep_mul_gen(res, tmp_expo); - free(tmp_expo); +// Exponentiation of a generic point `a` in E1, res = expo.a +void E1_mult(E1* res, const E1* p, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE1_mult_glv((POINTonE1*)res, (POINTonE1*)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } -void ep_mult_generic_bench(ep_t res, const Fr* expo) { - // generic point multiplication - ep_mult(res, &core_get()->ep_g, expo); +// Exponentiation of generator g1 of G1, res = expo.g1 +void G1_mult_gen(E1* res, const Fr* expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE1_mult_gls((POINTonE1*)res, &BLS12_381_G1, tmp); + vec_zero(&tmp, sizeof(tmp)); } // ------------------- E2 utilities @@ -996,7 +990,8 @@ static void E2_neg(E2* a) { void E2_mult(E2* res, const E2* p, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, tmp); + POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } // Exponentiation of a generic point `a` in E2 by a byte exponent. @@ -1005,14 +1000,16 @@ void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { vec_zero(&pow_expo, sizeof(pow256)); pow_expo[0] = expo; // `pow256` uses bytes little endian. 
// TODO: to bench against a specific version of mult with 8 bits expo - POINTonE2_sign((POINTonE2*)res, (POINTonE2*)p, pow_expo); + POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, pow_expo); + pow_expo[0] = 0; } // Exponentiation of generator g2 of G2, res = expo.g2 void G2_mult_gen(E2* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE2_sign((POINTonE2*)res, &BLS12_381_G2, tmp); + POINTonE2_mult_gls((POINTonE2*)res, &BLS12_381_G2, tmp); + vec_zero(&tmp, sizeof(tmp)); } // checks if input E2 point is on the subgroup G2. diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 5244e8cd16a..625e954397f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -123,15 +123,15 @@ void Fp_mul_montg(Fp *, const Fp *, const Fp *); void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities +int E1_in_G1(const ep_t); +int G1_simple_subgroup_check(const ep_t); +void E1_mult(E1*, const E1*, const Fr*); +void G1_mult_gen(E1*, const Fr*); + int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); -void ep_mult_gen_bench(ep_t, const Fr*); -void ep_mult_generic_bench(ep_t, const Fr*); -void ep_mult(ep_t, const ep_t, const Fr*); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -int E1_in_G1(const ep_t); -int G1_simple_subgroup_check(const ep_t); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); #if (MEMBERSHIP_CHECK_G1 == BOWE) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 563ca26811b..9a9026e4056 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -13,7 +13,7 @@ import ( ) // G1 and G2 scalar multiplication -func BenchmarkScalarMultG1G2(b *testing.B) { +func BenchmarkScalarMult(b *testing.B) { seed := make([]byte, securityBits/8) _, err := mrand.Read(seed) require.NoError(b, err) @@ -22,7 +22,9 @@ func BenchmarkScalarMultG1G2(b *testing.B) { _ = mapToFr(&expo, seed) // G1 generator multiplication - b.Run("G1 gen", func(b *testing.B) { + // Note that generator and random point multiplications + // are implemented with the same algorithm + b.Run("G1", func(b *testing.B) { var res pointE1 b.ResetTimer() for i := 0; i < b.N; i++ { @@ -31,18 +33,10 @@ func BenchmarkScalarMultG1G2(b *testing.B) { b.StopTimer() }) - // G1 base point multiplication - b.Run("G1 generic", func(b *testing.B) { - var res pointE1 - b.ResetTimer() - for i := 0; i < b.N; i++ { - genericScalarMultG1(&res, &expo) - } - b.StopTimer() - }) - - // G2 base point multiplication - b.Run("G2 gen", func(b *testing.B) { + // G2 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + b.Run("G2", func(b *testing.B) { var res pointE2 b.ResetTimer() for i := 0; i < b.N; i++ { From 109b238acb024071927fb68f6e364f7f8ac90199 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 20:06:37 -0600 Subject: [PATCH 077/200] E1_read_bytes and E1_write_bytes and their tools --- crypto/bls12381_utils.c | 231 +++++++++++++++++++++++++++++++++------- crypto/bls12381_utils.h | 15 ++- crypto/bls_core.c | 5 +- 3 files changed, 204 insertions(+), 47 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index f188cb0d33e..c0087591e8f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -375,17 +375,17 @@ bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { 
const Fp BLS12_381_pR = { ONE_MONT_P }; /* R mod p = (1<<384)%p */ // sets `a` to 0 -void Fp_set_zero(Fp* a){ +static void Fp_set_zero(Fp* a){ vec_zero((byte*)a, sizeof(Fp)); } // sets `a` to limb `l` -void Fp_set_limb(Fp* a, const limb_t l){ +static void Fp_set_limb(Fp* a, const limb_t l){ vec_zero((byte*)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); *((limb_t*)a) = l; } -void Fp_copy(Fp* res, const Fp* a) { +static void Fp_copy(Fp* res, const Fp* a) { vec_copy((byte*)res, (byte*)a, sizeof(Fp)); } @@ -393,14 +393,24 @@ static void Fp_add(Fp *res, const Fp *a, const Fp *b) { add_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); } -void Fp_sub(Fp *res, const Fp *a, const Fp *b) { +static void Fp_sub(Fp *res, const Fp *a, const Fp *b) { sub_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); } -void Fp_neg(Fp *res, const Fp *a) { +static void Fp_neg(Fp *res, const Fp *a) { cneg_mod_384((limb_t*)res, (limb_t*)a, 1, BLS12_381_P); } +// checks if `a` is a quadratic residue in Fp. If yes, it computes +// the square root in `res`. +// +// The boolean output is valid whether `a` is in Montgomery form or not, +// since montgomery constant `R` is a quadratic residue. +// However, the square root is valid only if `a` is in montgomery form. +static bool_t Fp_sqrt_montg(Fp *res, const Fp* a) { + return sqrt_fp((limb_t*)res, (limb_t*)a); +} + static bool check_Fp(const Fp* in) { // use same method as in BLST internal function // which seems the most efficient. The method uses the assembly-based @@ -497,37 +507,39 @@ static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form static byte Fp_get_sign(const Fp* y) { - return sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0); + // BLST's sgn0_pty_mont_384 requires input to be in Montg form. + // The needed sign bit is on position 1 ! + return (sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0)>>1) & 1; } // ------------------- Fp^2 utilities // sets `a` to limb `l` -void Fp2_set_limb(Fp2* a, const limb_t l){ +static void Fp2_set_limb(Fp2* a, const limb_t l){ Fp_set_limb(&real(a), l); Fp_set_zero(&imag(a)); } -void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { +static void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { add_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } -void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { +static void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { sub_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); } -void Fp2_neg(Fp2 *res, const Fp2 *a) { +static void Fp2_neg(Fp2 *res, const Fp2 *a) { cneg_mod_384(real(res), real(a), 1, BLS12_381_P); cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); } // res = a*b in montgomery form -void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { +static void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { mul_mont_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P, p0); } // res = a^2 in montgomery form -void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { +static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { sqr_mont_384x((vec384*)res, (vec384*)a, BLS12_381_P, p0); } @@ -537,7 +549,7 @@ void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. 
-static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { +static bool_t Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { return sqrt_fp2((vec384*)res, (vec384*)a); } @@ -545,6 +557,8 @@ static bool_t Fp2_sqrt(Fp2 *res, const Fp2* a) { // sign(y_0) if y_1 = 0, else sign(y_1) // y coordinates must be in montgomery form static byte Fp2_get_sign(Fp2* y) { + // BLST's sgn0_pty_mont_384x requires input to be in Montg form. + // The needed sign bit is on position 1 ! return (sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0)>>1) & 1; } @@ -578,16 +592,7 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- G1 utilities -// ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. -// len is the size of the input buffer. -// -// The resulting point is guaranteed to be on the curve E1. -// The serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_read_bin -// -// It returns RLC_OK if the inputs are valid (input buffer lengths are valid and coordinates correspond -// to a point on curve) and the execution completes, and RLC_ERR otherwise. +// TODO: to delete, only used by temporary E2_blst_to_relic int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { // check the length const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); @@ -661,11 +666,7 @@ static int fp_get_sign(const fp_t y) { return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; } -// ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_write_bin +// TODO: to delete, only used by temporary E2_blst_to_relic void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); @@ -701,6 +702,158 @@ void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { bin[0] |= (G1_SERIALIZATION << 7); } +void E1_copy(E1* res, const E1* p) { + vec_copy(res, p, sizeof(E1)); +} + +// compare p to infinity +bool_t E1_is_infty(const E1* p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); +} + +// set p to infinity +void E1_set_infty(E1* p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); +} + +// converts an E1 point from Jacobian into affine coordinates (z=1) +void E1_to_affine(E1* res, const E1* p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { + E1_copy(res, p); + return; + } + // convert from Jacobian + POINTonE1_from_Jacobian((POINTonE1*)res, (const POINTonE1*)p); +} + +// checks affine point `p` is in E1 +bool_t E1_affine_on_curve(const E1* p) { + // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! + return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); +} + +// E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or uncompressed form. +// The resulting point is guaranteed to be on curve E1 (no G1 check is included). 
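+// The serialization header is carried in the first byte: bit 7 is the compression flag,
+// bit 6 is the infinity flag, and bit 5 holds the sign of the y coordinate (used in the
+// compressed form only).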
+// Expected serialization follows:
+// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-)
+//
+// returns:
+//    - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid
+//    - BLST_BAD_SCALAR if Fp coordinates couldn't deserialize
+//    - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E1
+//    - BLST_SUCCESS if deserialization is valid
+
+// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z,
+// and update logic with G2 subgroup check?
+BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) {
+  // check the length
+  if (len != G1_SER_BYTES) {
+    return BLST_BAD_ENCODING;
+  }
+
+  // check the compression bit
+  int compressed = bin[0] >> 7;
+  if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) {
+    return BLST_BAD_ENCODING;
+  }
+
+  // check if the point is infinity
+  int is_infinity = bin[0] & 0x40;
+  if (is_infinity) {
+    // the remaining bits need to be cleared
+    if (bin[0] & 0x3F) {
+      return BLST_BAD_ENCODING;
+    }
+    for (int i=1; i<G1_SER_BYTES-1; i++) {
+      if (bin[i]) {
+        return BLST_BAD_ENCODING;
+      }
+    }
+    E1_set_infty(a);
+    return BLST_SUCCESS;
+  }
+
+  // read the sign bit and check for consistency
+  int y_sign = (bin[0] >> 5) & 1;
+  if (y_sign && (!compressed)) {
+    return BLST_BAD_ENCODING;
+  }
+
+  // use a temporary buffer to mask the header bits and read a.x
+  byte temp[Fp_BYTES];
+  memcpy(temp, bin, Fp_BYTES);
+  temp[0] &= 0x1F;        // clear the header bits
+  BLST_ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp));
+  if (ret != BLST_SUCCESS) {
+    return ret;
+  }
+
+  // set a.z to 1
+  Fp* a_z = &(a->z);
+  Fp_set_limb(a_z, 1);
+
+  if (G1_SERIALIZATION == UNCOMPRESSED) {
+    ret = Fp_read_bytes(&(a->y), bin + Fp_BYTES, sizeof(a->y));
+    if (ret != BLST_SUCCESS){
+      return ret;
+    }
+    // check read point is on curve
+    if (!E1_affine_on_curve(a)) {
+      return BLST_POINT_NOT_ON_CURVE;
+    }
+    return BLST_SUCCESS;
+  }
+
+  // compute the possible square root
+  Fp* a_x = &(a->x);
+  Fp_to_montg(a_x, a_x);
+
+  Fp* a_y = &(a->y);
+  Fp_squ_montg(a_y, a_x);
+  Fp_mul_montg(a_y, a_y, a_x);
+  Fp_add(a_y, a_y, &B_E1);      // B_E1 is already in Montg form
+  if (!Fp_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue
+    return BLST_POINT_NOT_ON_CURVE;
+
+  // resulting (x,y) is guaranteed to be on curve (y is already in Montg form)
+  if (Fp_get_sign(a_y) != y_sign) {
+    Fp_neg(a_y, a_y); // flip y sign if needed
+  }
+  return BLST_SUCCESS;
+}
+
+// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or uncompressed form.
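+// (G1_SER_BYTES is 48 bytes when the compressed form is used and 96 bytes otherwise.)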
+// It assumes buffer is of length G1_SER_BYTES +// The serialization follows: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +void E1_write_bytes(byte *bin, const E1* a) { + if (E1_is_infty(a)) { + // set the infinity bit + bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); + memset(bin+1, 0, G1_SER_BYTES-1); + return; + } + E1 tmp; + E1_to_affine(&tmp, a); // TODO: implement + + Fp* t_x = &(tmp.x); + Fp_from_montg(t_x, t_x); + Fp_write_bytes(bin, t_x); + + Fp* t_y = &(tmp.y); + if (G1_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp_get_sign(t_y) << 5); + } else { + Fp_from_montg(t_y, t_y); + Fp_write_bytes(bin + Fp_BYTES, t_y); + } + + bin[0] |= (G1_SERIALIZATION << 7); +} + // Exponentiation of a generic point `a` in E1, res = expo.a void E1_mult(E1* res, const E1* p, const Fr* expo) { pow256 tmp; @@ -713,7 +866,7 @@ void E1_mult(E1* res, const E1* p, const Fr* expo) { void G1_mult_gen(E1* res, const Fr* expo) { pow256 tmp; pow256_from_Fr(tmp, expo); - POINTonE1_mult_gls((POINTonE1*)res, &BLS12_381_G1, tmp); + POINTonE1_mult_glv((POINTonE1*)res, &BLS12_381_G1, tmp); vec_zero(&tmp, sizeof(tmp)); } @@ -810,11 +963,9 @@ ep2_st* E2_blst_to_relic(const E2* x) { return out; } -// E2_read_bytes imports a point from a buffer in a compressed or uncompressed form. -// The resulting point is guaranteed to be on curve E2 (no G2 check is included) +// E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or uncompressed form. +// The resulting point is guaranteed to be on curve E2 (no G2 check is included). // -// reads a scalar in `a` and checks it is a valid Fp element (a < p). -// input is bytes-big-endian. // returns: // - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid // - BLST_BAD_SCALAR if Fp^2 coordinates couldn't deserialize @@ -848,7 +999,7 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { } } E2_set_infty(a); - return RLC_OK; + return BLST_SUCCESS; } // read the sign bit and check for consistency @@ -892,7 +1043,7 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { Fp2_squ_montg(a_y, a_x); Fp2_mul_montg(a_y, a_y, a_x); Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form - if (!Fp2_sqrt(a_y, a_y)) // check whether x^3+b is a quadratic residue + if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue return BLST_POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) @@ -902,11 +1053,10 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { return BLST_SUCCESS; } -// E2_write_bytes exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. +// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or uncompressed form. // It assumes buffer is of length G2_SER_BYTES // The serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep2_write_bin +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) void E2_write_bytes(byte *bin, const E2* a) { if (E2_is_infty(a)) { // set the infinity bit @@ -948,8 +1098,7 @@ bool_t E2_is_infty(const E2* p) { // checks affine point `p` is in E2 bool_t E2_affine_on_curve(const E2* p) { - // BLST's `POINTonE2_affine_on_curve` does not include the inifity case, - // unlike what the function name means. 
+ // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); } @@ -966,7 +1115,7 @@ void E2_copy(E2* res, const E2* p) { // converts an E2 point from Jacobian into affine coordinates (z=1) void E2_to_affine(E2* res, const E2* p) { - // minor optimization in case coordinates are already affine + // optimize in case coordinates are already affine if (vec_is_equal(p->z, BLS12_381_Rx.p2, Fp2_BYTES)) { E2_copy(res, p); return; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 625e954397f..24f43c96a26 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -123,10 +123,17 @@ void Fp_mul_montg(Fp *, const Fp *, const Fp *); void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities -int E1_in_G1(const ep_t); -int G1_simple_subgroup_check(const ep_t); -void E1_mult(E1*, const E1*, const Fr*); -void G1_mult_gen(E1*, const Fr*); +void E1_copy(E1*, const E1*); +void E1_set_infty(E1*); +bool_t E1_is_infty(const E1*); +void E1_to_affine(E1*, const E1*); +bool_t E1_affine_on_curve(const E1*); +bool_t E1_in_G1(const ep_t); +int G1_simple_subgroup_check(const ep_t); +void E1_mult(E1*, const E1*, const Fr*); +void G1_mult_gen(E1*, const Fr*); +BLST_ERROR E1_read_bytes(E1*, const byte *, const int); +void E1_write_bytes(byte *, const E1*); int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 58a7287578f..5ea02115d66 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,7 +21,7 @@ int get_sk_len() { // Checks if input point p is in the subgroup G1. // The function assumes the input is known to be on the curve E1. -int E1_in_G1(const ep_t p){ +bool_t E1_in_G1(const ep_t p){ // TODO: to upadte /* #if MEMBERSHIP_CHECK_G1 == EXP_ORDER @@ -42,7 +42,8 @@ static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { ep_new(p); // s = h^sk - ep_mult(p, h, sk); + //ep_mult(p, h, sk); + ep_copy(p, h); ep_write_bin_compact(s, p, SIGNATURE_LEN); ep_free(p); } From 5ae1abbf8a6a4a5697bbc17bae7f23ebf25fa103 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 11 May 2023 20:23:05 -0600 Subject: [PATCH 078/200] G1 membership check and connect E1 read/write to the Go layer --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 7 ++++++ crypto/bls12381_utils.go | 46 ++++++++++++++++++------------------ crypto/bls12381_utils.h | 2 +- crypto/bls_crossBLST_test.go | 2 +- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 804c34b619c..d814b1209c2 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -500,7 +500,7 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { panic("library is not configured to use compressed public key serialization") } dest := make([]byte, pubKeyLengthBLSBLS12381) - writePointG2(dest, &a.point) + writePointE2(dest, &a.point) return dest } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c0087591e8f..c9fca01df9a 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -735,6 +735,13 @@ bool_t E1_affine_on_curve(const E1* p) { return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); } +// checks if input E1 point is on the subgroup G1. +// It assumes input `p` is on E1. 
+bool_t E1_in_G1(const E1* p){ + // currently uses Scott method + return POINTonE1_in_G1((const POINTonE1*)p); +} + // E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or uncompressed form. // The resulting point is guaranteed to be on curve E1 (no G1 check is included). // Expected serialization follows: diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index df1df3e4f2b..bf6e4f996d0 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -77,7 +77,7 @@ func (a *scalar) String() string { func (p *pointE2) String() string { encoding := make([]byte, pubKeyLengthBLSBLS12381) - writePointG2(encoding, p) + writePointE2(encoding, p) return fmt.Sprintf("%#x", encoding) } @@ -166,21 +166,18 @@ func writeScalar(dest []byte, x *scalar) { C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) } -// writePointG2 writes a G2 point in a slice of bytes +// writePointE2 writes a G2 point in a slice of bytes // The slice should be of size PubKeyLenBLSBLS12381 and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG2(dest []byte, a *pointE2) { +func writePointE2(dest []byte, a *pointE2) { C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) } -// writePointG1 writes a G1 point in a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the serialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG1(dest []byte, a *pointE1) { - C.ep_write_bin_compact((*C.uchar)(&dest[0]), - (*C.E1)(a), - (C.int)(signatureLengthBLSBLS12381), - ) +// writePointE1 writes a G1 point in a slice of bytes +// The slice should be of size SignatureLenBLSBLS12381 and the serialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves +func writePointE1(dest []byte, a *pointE1) { + C.E1_write_bytes((*C.uchar)(&dest[0]), (*C.E1)(a)) } // read an Fr* element from a byte slice @@ -218,11 +215,11 @@ func readPointE2(a *pointE2, src []byte) error { case blst_valid: return nil case blst_bad_encoding, blst_bad_scalar: - return invalidInputsErrorf("input could not deserialize to a G2 point") + return invalidInputsErrorf("input could not deserialize to a E2 point") case blst_point_not_on_curve: return invalidInputsErrorf("input is not a point on curve E2") default: - return errors.New("reading a G2 point failed") + return errors.New("reading E2 point failed") } } @@ -231,23 +228,26 @@ func readPointE2(a *pointE2, src []byte) error { // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G1 membership check is performed. func readPointE1(a *pointE1, src []byte) error { - switch C.ep_read_bin_compact((*C.E1)(a), + read := C.E1_read_bytes((*C.E1)(a), (*C.uchar)(&src[0]), - (C.int)(len(src))) { - case valid: + (C.int)(len(src))) + + switch int(read) { + case blst_valid: return nil - case invalid: - return invalidInputsErrorf("input is not a G1 point") + case blst_bad_encoding, blst_bad_scalar: + return invalidInputsErrorf("input could not deserialize to a E1 point") + case blst_point_not_on_curve: + return invalidInputsErrorf("input is not a point on curve E1") default: - return errors.New("reading a G1 point failed") + return errors.New("reading E1 point failed") } } // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. 
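// It returns true if and only if the input point, assumed to be on the curve E1,
// is in the prime-order subgroup G1.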
func checkMembershipG1(pt *pointE1) bool { - //return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) - return true + return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used @@ -302,10 +302,10 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) // serialize the point pointBytes := make([]byte, signatureLengthBLSBLS12381) - writePointG1(pointBytes, &point) + writePointE1(pointBytes, &point) return pointBytes } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 24f43c96a26..50bec52f133 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -128,7 +128,7 @@ void E1_set_infty(E1*); bool_t E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); bool_t E1_affine_on_curve(const E1*); -bool_t E1_in_G1(const ep_t); +bool_t E1_in_G1(const E1*); int G1_simple_subgroup_check(const ep_t); void E1_mult(E1*, const E1*, const Fr*); void G1_mult_gen(E1*, const Fr*); diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e67c3c3bc33..e9b1607a721 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -158,7 +158,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // check both serializations of G1 points are equal if flowPass && blstPass { sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) - writePointG1(sigFlowOutBytes, &pointFlow) + writePointE1(sigFlowOutBytes, &pointFlow) sigBLSTOutBytes := pointBLST.Compress() assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes) From 8d32cffd9acdf45b4b3461e3cd3abeddab591fd8 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 00:29:17 -0600 Subject: [PATCH 079/200] map to G1 using BLST and add length sanity check of input hash --- crypto/bls.go | 15 ++++++++--- crypto/bls12381_hashtocurve.c | 23 ++++++++++++---- crypto/bls12381_utils.c | 8 ++++-- crypto/bls12381_utils.go | 5 +++- crypto/bls12381_utils.h | 6 +++-- crypto/bls12381_utils_test.go | 6 +++-- crypto/bls_core.c | 51 +++++++++++++++-------------------- crypto/bls_include.h | 10 +++---- 8 files changed, 72 insertions(+), 52 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index d814b1209c2..77164298a1d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -66,8 +66,6 @@ const ( PubKeyLenBLSBLS12381 = 2 * fieldSize * (2 - serializationG2) // the length is divided by 2 if compression is on // Hash to curve params - // expandMsgOutput is the output length of the expand_message step as required by the hash_to_curve algorithm - expandMsgOutput = 2 * (fieldSize + (securityBits / 8)) // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" // scheme implemented as a countermasure for rogue attacks of the form : SchemeTag_ @@ -79,6 +77,12 @@ const ( blsPOPCipherSuite = "BLS_POP_" + h2cSuiteID + schemeTag ) +// expandMsgOutput is the output length of the expand_message step as required by the +// hash_to_curve algorithm (and the map to G1 step) +// +// (Cgo does not export C macros) +var expandMsgOutput = C.get_mapToG1_input_len() + // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { // points to Relic context of BLS12-381 with all the parameters @@ -546,11 +550,14 @@ func (a *blsBLS12381Algo) init() error { } // This is only a TEST/DEBUG/BENCH function. 
-// It returns the hash to G1 point from a slice of 128 bytes +// It returns the hash-to-G1 point from a slice of 128 bytes func mapToG1(data []byte) *pointE1 { l := len(data) var h pointE1 - C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) + ret := C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) + if int(ret) != valid { + return nil + } return &h } diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c index 3e8217d42e5..bff5e92f468 100644 --- a/crypto/bls12381_hashtocurve.c +++ b/crypto/bls12381_hashtocurve.c @@ -327,12 +327,25 @@ static void map_to_G1_local(ep_t p, const uint8_t *msg, int len) { } #endif -// computes a hash of input data to G1 -// construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -void map_to_G1(ep_t h, const byte* data, const int len) { +// maps input `hash` bytes to G1. +// `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) +// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf +int map_to_G1(E1* h, const byte* hash, const int len) { + // sanity check of length + if (len != MAP_TO_G1_INPUT_LEN) { + return INVALID; + } + #if hashToPoint==LOCAL_SSWU map_to_G1_local(h, data, len); - #elif hashToPoint==RELIC_SSWU - ep_map_from_field(h, data, len); + + #elif hashToPoint==BLST_SSWU + // map to field elements + Fr u[2]; + map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); + map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); + // map field elements to G1 + map_to_g1(h, (POINTonE1 *)&u[0], (POINTonE1 *)&u[1]); #endif + return VALID; } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c9fca01df9a..97c26b57713 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -25,6 +25,10 @@ int get_Fr_BYTES() { return Fr_BYTES; } +int get_mapToG1_input_len() { + return MAP_TO_G1_INPUT_LEN; +} + // Initializes Relic context with BLS12-381 parameters ctx_t* relic_init_BLS12_381() { @@ -411,7 +415,7 @@ static bool_t Fp_sqrt_montg(Fp *res, const Fp* a) { return sqrt_fp((limb_t*)res, (limb_t*)a); } -static bool check_Fp(const Fp* in) { +static bool Fp_check(const Fp* in) { // use same method as in BLST internal function // which seems the most efficient. 
The method uses the assembly-based // modular addition instead of limbs comparison @@ -453,7 +457,7 @@ BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { } limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); // compare read scalar to p - if (!check_Fp(a)) { + if (!Fp_check(a)) { return BLST_BAD_ENCODING; } return BLST_SUCCESS; diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index bf6e4f996d0..103577013cc 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -302,7 +302,10 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + ret := C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + if int(ret) != valid { + return nil + } // serialize the point pointBytes := make([]byte, signatureLengthBLSBLS12381) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 50bec52f133..58023376c45 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -10,6 +10,7 @@ #include "relic.h" #include "blst_include.h" +#define SEC_BITS 128 #define VALID RLC_OK #define INVALID RLC_ERR #define UNDEFINED (((VALID&1)^1) | ((INVALID&2)^2)) // different value than RLC_OK and RLC_ERR @@ -21,7 +22,6 @@ #define MIN(a,b) ((a)>(b)?(b):(a)) // Fields and Group serialization lengths -#define SEC_BITS 128 #define Fp_BITS 381 #define Fp2_BYTES (2*Fp_BYTES) #define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) @@ -88,12 +88,14 @@ ep2_st* E2_blst_to_relic(const E2* x); int get_valid(); int get_invalid(); int get_Fr_BYTES(); +int get_mapToG1_input_len(); // BLS based SPoCK int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); // hash to curve functions (functions in bls12381_hashtocurve.c) -void map_to_G1(ep_t, const byte*, const int); +#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) +int map_to_G1(E1*, const byte*, const int); // Fr utilities extern const Fr BLS12_381_rR; diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 9a9026e4056..3fa827d2cc9 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -69,6 +69,7 @@ func TestMapToG1(t *testing.T) { for i, msg := range msgs { pointBytes := hashToG1Bytes(msg, dst) + require.NonNil(t, pointBytes) expectedPointBytes, err := hex.DecodeString(expectedPointString[i]) require.NoError(t, err) @@ -80,15 +81,16 @@ func TestMapToG1(t *testing.T) { // Hashing to G1 bench func BenchmarkMapToG1(b *testing.B) { - input := make([]byte, expandMsgOutput) for i := 0; i < len(input); i++ { input[i] = byte(i) } b.ResetTimer() + var p *pointE1 for i := 0; i < b.N; i++ { - mapToG1(input) + p = mapToG1(input) } + require.NonNil(b, p) b.StopTimer() } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 5ea02115d66..7b3021b84a1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -19,23 +19,6 @@ int get_sk_len() { return SK_LEN; } -// Checks if input point p is in the subgroup G1. -// The function assumes the input is known to be on the curve E1. 
-bool_t E1_in_G1(const ep_t p){ -// TODO: to upadte -/* - #if MEMBERSHIP_CHECK_G1 == EXP_ORDER - return G1_simple_subgroup_check(p); - #elif MEMBERSHIP_CHECK_G1 == BOWE - // section 3.2 from https://eprint.iacr.org/2019/814.pdf - return bowe_subgroup_check_G1(p); - #else - return UNDEFINED; - #endif -*/ - return VALID; -} - // Computes a BLS signature from a G1 point static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { ep_t p; @@ -49,14 +32,19 @@ static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { } // Computes a BLS signature from a hash -void bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { +// `data` represents the hashed message with length `len` equal to +// `MAP_TO_G1_INPUT_LEN`. +int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { ep_t h; ep_new(h); // hash to G1 - map_to_G1(h, data, len); + if (map_to_G1(h, data, len) != VALID) { + return INVALID; + } // s = h^sk bls_sign_ep(s, sk, h); ep_free(h); + return VALID; } // Verifies a BLS signature (G1 point) against a public key (G2 point) @@ -67,23 +55,25 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int ep_t elemsG1[2]; ep2_t elemsG2[2]; - // elemsG1[0] = s ep_new(elemsG1[0]); - ep_copy(elemsG1[0], (ep_st*)s); - - // elemsG1[1] = h ep_new(elemsG1[1]); - // hash to G1 - map_to_G1(elemsG1[1], data, len); + ep2_new(elemsG2[1]); + ep2_new(&elemsG2[0]); - ep2_st* pk_tmp = E2_blst_to_relic(pk); + int ret = UNDEFINED; + + // elemsG1[0] = s + ep_copy(elemsG1[0], (ep_st*)s); // elemsG2[1] = pk - ep2_new(elemsG2[1]); + ep2_st* pk_tmp = E2_blst_to_relic(pk); ep2_copy(elemsG2[1], pk_tmp); - ep2_new(&elemsG2[0]); - int ret = UNDEFINED; + // elemsG1[1] = h + if (map_to_G1(elemsG1[1], data, len) != VALID) { + ret = INVALID; + goto out; + } #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -321,7 +311,8 @@ int bls_verifyPerDistinctKey(const byte* sig, // Verifies a BLS signature in a byte buffer. // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications using the same key. +// the membership check in G2 is separated to optimize multiple verifications using the same key. +// `data` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { ep_t s; ep_new(s); diff --git a/crypto/bls_include.h b/crypto/bls_include.h index d0f9120beb2..0da961feae2 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -21,18 +21,16 @@ #define SINGLE_PAIRING (DOUBLE_PAIRING^1) // algorithm choice for hashing to G1 -// both methods are similar implementations of the same optimized SSWU -// but offer different timings. 
-#define RELIC_SSWU 1 // relic library implementation -#define LOCAL_SSWU 2 // local implementation -#define hashToPoint LOCAL_SSWU +#define BLST_SSWU 1 // BLST implementation +#define LOCAL_SSWU 2 // local implementation +#define hashToPoint BLST_SSWU // bls core (functions in bls_core.c) int get_signature_len(); int get_pk_len(); int get_sk_len(); -void bls_sign(byte*, const Fr*, const byte*, const int); +int bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, const uint32_t*, const E2*); From 9efaddb65595f0311ccda0f4ea82ee62f143a4f7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 00:40:13 -0600 Subject: [PATCH 080/200] remove local SSWU implementation and rely only on BLST map to G1 tools --- crypto/bls.go | 5 +- crypto/bls12381_hashtocurve.c | 351 ---------------------------------- crypto/bls12381_utils.c | 39 ++-- crypto/bls12381_utils.go | 3 +- crypto/bls12381_utils.h | 21 -- crypto/bls12381_utils_test.go | 4 +- crypto/bls_include.h | 5 - 7 files changed, 22 insertions(+), 406 deletions(-) delete mode 100644 crypto/bls12381_hashtocurve.c diff --git a/crypto/bls.go b/crypto/bls.go index 77164298a1d..43f42f1115d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -81,7 +81,7 @@ const ( // hash_to_curve algorithm (and the map to G1 step) // // (Cgo does not export C macros) -var expandMsgOutput = C.get_mapToG1_input_len() +var expandMsgOutput = int(C.get_mapToG1_input_len()) // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { @@ -554,8 +554,7 @@ func (a *blsBLS12381Algo) init() error { func mapToG1(data []byte) *pointE1 { l := len(data) var h pointE1 - ret := C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) - if int(ret) != valid { + if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { return nil } return &h diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c deleted file mode 100644 index bff5e92f468..00000000000 --- a/crypto/bls12381_hashtocurve.c +++ /dev/null @@ -1,351 +0,0 @@ -// +build relic - -#include "bls12381_utils.h" -#include "bls_include.h" - -extern prec_st* bls_prec; - -#if (hashToPoint== LOCAL_SSWU) - -// These constants are taken from https://github.com/kwantam/bls12-381_hash -// and converted to the Mongtomery domain. -// Copyright 2019 Riad S. 
Wahby -const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS] = { - {0x4d18b6f3af00131c, 0x19fa219793fee28c, 0x3f2885f1467f19ae, - 0x23dcea34f2ffb304, 0xd15b58d2ffc00054, 0x0913be200a20bef4,}, - {0x898985385cdbbd8b, 0x3c79e43cc7d966aa, 0x1597e193f4cd233a, - 0x8637ef1e4d6623ad, 0x11b22deed20d827b, 0x07097bc5998784ad,}, - {0xa542583a480b664b, 0xfc7169c026e568c6, 0x5ba2ef314ed8b5a6, - 0x5b5491c05102f0e7, 0xdf6e99707d2a0079, 0x0784151ed7605524,}, - {0x494e212870f72741, 0xab9be52fbda43021, 0x26f5577994e34c3d, - 0x049dfee82aefbd60, 0x65dadd7828505289, 0x0e93d431ea011aeb,}, - {0x90ee774bd6a74d45, 0x7ada1c8a41bfb185, 0x0f1a8953b325f464, - 0x104c24211be4805c, 0x169139d319ea7a8f, 0x09f20ead8e532bf6,}, - {0x6ddd93e2f43626b7, 0xa5482c9aa1ccd7bd, 0x143245631883f4bd, - 0x2e0a94ccf77ec0db, 0xb0282d480e56489f, 0x18f4bfcbb4368929,}, - {0x23c5f0c953402dfd, 0x7a43ff6958ce4fe9, 0x2c390d3d2da5df63, - 0xd0df5c98e1f9d70f, 0xffd89869a572b297, 0x1277ffc72f25e8fe,}, - {0x79f4f0490f06a8a6, 0x85f894a88030fd81, 0x12da3054b18b6410, - 0xe2a57f6505880d65, 0xbba074f260e400f1, 0x08b76279f621d028,}, - {0xe67245ba78d5b00b, 0x8456ba9a1f186475, 0x7888bff6e6b33bb4, - 0xe21585b9a30f86cb, 0x05a69cdcef55feee, 0x09e699dd9adfa5ac,}, - {0x0de5c357bff57107, 0x0a0db4ae6b1a10b2, 0xe256bb67b3b3cd8d, - 0x8ad456574e9db24f, 0x0443915f50fd4179, 0x098c4bf7de8b6375,}, - {0xe6b0617e7dd929c7, 0xfe6e37d442537375, 0x1dafdeda137a489e, - 0xe4efd1ad3f767ceb, 0x4a51d8667f0fe1cf, 0x054fdf4bbf1d821c,}, - {0x72db2a50658d767b, 0x8abf91faa257b3d5, 0xe969d6833764ab47, - 0x464170142a1009eb, 0xb14f01aadb30be2f, 0x18ae6a856f40715d,}, -}; - -const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS] = { - {0x2b567ff3e2837267, 0x1d4d9e57b958a767, 0xce028fea04bd7373, - 0xcc31a30a0b6cd3df, 0x7d7b18a682692693, 0x0d300744d42a0310,}, - {0x99c2555fa542493f, 0xfe7f53cc4874f878, 0x5df0608b8f97608a, - 0x14e03832052b49c8, 0x706326a6957dd5a4, 0x0a8dadd9c2414555,}, - {0x13d942922a5cf63a, 0x357e33e36e261e7d, 0xcf05a27c8456088d, - 0x0000bd1de7ba50f0, 0x83d0c7532f8c1fde, 0x13f70bf38bbf2905,}, - {0x5c57fd95bfafbdbb, 0x28a359a65e541707, 0x3983ceb4f6360b6d, - 0xafe19ff6f97e6d53, 0xb3468f4550192bf7, 0x0bb6cde49d8ba257,}, - {0x590b62c7ff8a513f, 0x314b4ce372cacefd, 0x6bef32ce94b8a800, - 0x6ddf84a095713d5f, 0x64eace4cb0982191, 0x0386213c651b888d,}, - {0xa5310a31111bbcdd, 0xa14ac0f5da148982, 0xf9ad9cc95423d2e9, - 0xaa6ec095283ee4a7, 0xcf5b1f022e1c9107, 0x01fddf5aed881793,}, - {0x65a572b0d7a7d950, 0xe25c2d8183473a19, 0xc2fcebe7cb877dbd, - 0x05b2d36c769a89b0, 0xba12961be86e9efb, 0x07eb1b29c1dfde1f,}, - {0x93e09572f7c4cd24, 0x364e929076795091, 0x8569467e68af51b5, - 0xa47da89439f5340f, 0xf4fa918082e44d64, 0x0ad52ba3e6695a79,}, - {0x911429844e0d5f54, 0xd03f51a3516bb233, 0x3d587e5640536e66, - 0xfa86d2a3a9a73482, 0xa90ed5adf1ed5537, 0x149c9c326a5e7393,}, - {0x462bbeb03c12921a, 0xdc9af5fa0a274a17, 0x9a558ebde836ebed, - 0x649ef8f11a4fae46, 0x8100e1652b3cdc62, 0x1862bd62c291dacb,}, - {0x05c9b8ca89f12c26, 0x0194160fa9b9ac4f, 0x6a643d5a6879fa2c, - 0x14665bdd8846e19d, 0xbb1d0d53af3ff6bf, 0x12c7e1c3b28962e5,}, - {0xb55ebf900b8a3e17, 0xfedc77ec1a9201c4, 0x1f07db10ea1a4df4, - 0x0dfbd15dc41a594d, 0x389547f2334a5391, 0x02419f98165871a4,}, - {0xb416af000745fc20, 0x8e563e9d1ea6d0f5, 0x7c763e17763a0652, - 0x01458ef0159ebbef, 0x8346fe421f96bb13, 0x0d2d7b829ce324d2,}, - {0x93096bb538d64615, 0x6f2a2619951d823a, 0x8f66b3ea59514fa4, - 0xf563e63704f7092f, 0x724b136c4cf2d9fa, 0x046959cfcfd0bf49,}, - {0xea748d4b6e405346, 0x91e9079c2c02d58f, 0x41064965946d9b59, - 0xa06731f1d2bbe1ee, 0x07f897e267a33f1b, 0x1017290919210e5f,}, - 
{0x872aa6c17d985097, 0xeecc53161264562a, 0x07afe37afff55002, - 0x54759078e5be6838, 0xc4b92d15db8acca8, 0x106d87d1b51d13b9,}, -}; - -// sqrt_ration optimized for p mod 4 = 3. -// Check if (U/V) is a square, return 1 if yes, 0 otherwise -// If 1 is returned, out contains sqrt(U/V), -// otherwise out is sqrt(z*U/V) -// out should not be the same as U, or V -static int sqrt_ratio_3mod4(fp_t out, const fp_t u, const fp_t v) { - fp_t t0, t1, t2; - - fp_sqr(t1, v); // V^2 - fp_mul(t2, u, v); // U*V - fp_mul(t1, t1, t2); // U*V^3 - fp_exp(out, t1, &bls_prec->p_3div4); // (U*V^3)^((p-3)/4) - fp_mul(out, out, t2); // (U*V)*(U*V^3)^((p-3)/4) = U^((p+1)/4) * V^(3p-5)/4 - - fp_sqr(t0, out); // out^2 - fp_mul(t0, t0, v); // out^2 * V - - int res = 1; - if (fp_cmp(t0, u) != RLC_EQ) { // check whether U/V is a quadratic residue - fp_mul(out, out, bls_prec->sqrt_z); // sqrt(-z)*U*V(UV^3)^((p-3)/4) - res = 0; - } - - return res; -} - -// returns 1 if input is odd and 0 if input is even -static int sign_0(const fp_t in) { -#if FP_RDC == MONTY - bn_t tmp; - fp_prime_back(tmp, in); // TODO: entire reduction may not be needed to get the parity - return bn_is_even(tmp); -#endif - return in[0]&1; -} - -// Maps the field element t to a point p in E1(Fp) where E1: y^2 = g(x) = x^3 + a1*x + b1 -// using optimized non-constant-time Simplified SWU implementation (A.B = 0) -// Outout point p is in Jacobian coordinates to avoid extra inversions. -static inline void map_to_E1_osswu(ep_t p, const fp_t t) { - fp_t t0, t1, t2, t3, t4; - - // get the isogeny map coefficients - ctx_t* ctx = core_get(); - fp_t *a1 = &ctx->ep_iso.a; - fp_t *b1 = &ctx->ep_iso.b; - fp_t *z = &ctx->ep_map_u; - - // compute numerator and denominator of X0(t) = N / D - fp_sqr(t1, t); // t^2 - fp_mul(t1, t1, *z); // z * t^2 - fp_sqr(t2, t1); // z^2 * t^4 - fp_add(t2, t2, t1); // z * t^2 + z^2 * t^4 - fp_add(t3, t2, bls_prec->r); // z * t^2 + z^2 * t^4 + 1 - fp_mul(t3, t3, *b1); // N = b * (z * t^2 + z^2 * t^4 + 1) - - if (fp_is_zero(t2)) { - fp_copy(p->z, bls_prec->a1z); // D = a * z - } else { - fp_mul(p->z, t2, bls_prec->minus_a1); // D = - a * (z * t^2 + z^2 * t^4) - } - - // compute numerator and denominator of g(X0(t)) = U / V - // U = N^3 + a1 * N * D^2 + b1 * D^3 - // V = D^3 - fp_sqr(t2, t3); // N^2 - fp_sqr(t0, p->z); // D^2 - fp_mul(t4, *a1, t0); // a * D^2 - fp_add(t2, t4, t2); // N^2 + a * D^2 - fp_mul(t2, t3, t2); // N^3 + a * N * D^2 - fp_mul(t0, t0, p->z); // V = D^3 - fp_mul(t4, *b1, t0); // b * V = b * D^3 - fp_add(t2, t4, t2); // U = N^3 + a1 * N * D^2 + b1 * D^3 - - // compute sqrt(U/V) - int is_sqr = sqrt_ratio_3mod4(p->y, t2, t0); - if (is_sqr) { - fp_copy(p->x, t3); // x = N - } else { - fp_mul(p->x, t1, t3); // x = N * z * t^2 - fp_mul(t1, t1, t); // z * t^3 - fp_mul(p->y, p->y, t1); // y = z * t^3 * sqrt(r * U/V) where r is 1 or map coefficient z - } - - // negate y to be the same sign of t - if (sign_0(t) != sign_0(p->y)) { - fp_neg(p->y, p->y); // -y - } - - // convert (x/D, y) into Jacobian (X,Y,Z) where Z=D to avoid inversion. - // Z = D, X = x/D * D^2 = x*D , Y = y*D^3 - fp_mul(p->x, p->x, p->z); // X = N*D - fp_mul(p->y, p->y, t0); // Y = y*D^3 - // p->z is already equal to D - p->coord = JACOB; -} - -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. -// Copyright 2019 Riad S. 
Wahby -static inline void hornerPolynomial(fp_t accumulator, const fp_t x, const int start_val, const fp_t fp_tmp[]) { - for (int i = start_val; i >= 0; --i) { - fp_mul(accumulator, accumulator, x); // acc *= x - fp_add(accumulator, accumulator, fp_tmp[i]); // acc += next_val - } -} - -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. -// Copyright 2019 Riad S. Wahby -static inline void compute_map_zvals(fp_t out[], const fp_t inv[], const fp_t zv[], const unsigned len) { - for (unsigned i = 0; i < len; ++i) { - fp_mul(out[i], inv[i], zv[i]); - } -} - -// 11-isogeny map -// computes the mapping of p and stores the result in r -// -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. The constant tables -// iso_D and iso_N were converted to the Montgomery domain. -// -// Copyright 2019 Riad S. Wahby -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -static inline void eval_iso11(ep_t r, const ep_t p) { - fp_t fp_tmp[32]; - - // precompute even powers of Z up to Z^30 in fp_tmp[31]..fp_tmp[17] - fp_sqr(fp_tmp[31], p->z); // Z^2 - fp_sqr(fp_tmp[30], fp_tmp[31]); // Z^4 - fp_mul(fp_tmp[29], fp_tmp[30], fp_tmp[31]); // Z^6 - fp_sqr(fp_tmp[28], fp_tmp[30]); // Z^8 - fp_mul(fp_tmp[27], fp_tmp[28], fp_tmp[31]); // Z^10 - fp_sqr(fp_tmp[26], fp_tmp[29]); // Z^12 - fp_mul(fp_tmp[25], fp_tmp[26], fp_tmp[31]); // Z^14 - fp_sqr(fp_tmp[24], fp_tmp[28]); // Z^16 - fp_mul(fp_tmp[23], fp_tmp[24], fp_tmp[31]); // Z^18 - fp_sqr(fp_tmp[22], fp_tmp[27]); // Z^20 - fp_mul(fp_tmp[21], fp_tmp[22], fp_tmp[31]); // Z^22 - fp_sqr(fp_tmp[20], fp_tmp[26]); // Z^24 - fp_mul(fp_tmp[19], fp_tmp[20], fp_tmp[31]); // Z^26 - fp_sqr(fp_tmp[18], fp_tmp[25]); // Z^28 - fp_mul(fp_tmp[17], fp_tmp[18], fp_tmp[31]); // Z^30 - - // get isogeny map coefficients - iso_t iso = ep_curve_get_iso(); - // hardcode the constant to avoid warnings of gcc -Wstringop-overread - const int deg_dy = 15; // also equal to iso->deg_yd; - const int deg_dx = 10; // also equal to iso->deg_xd; - // TODO: get N coefficient from Relic and update N computations - - // y = Ny/Dy - // compute Dy - compute_map_zvals(fp_tmp, iso->yd, fp_tmp + 17, deg_dy); // k_(15-i) Z^(2i) - fp_add(fp_tmp[16], p->x, fp_tmp[deg_dy - 1]); // X + k_14 Z^2 - hornerPolynomial(fp_tmp[16], p->x, deg_dy - 2, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[15], fp_tmp[16], fp_tmp[31]); // Dy * Z^2 - fp_mul(fp_tmp[15], fp_tmp[15], p->z); // Dy * Z^3 - - // compute Ny - compute_map_zvals(fp_tmp, bls_prec->iso_Ny, fp_tmp + 17, ELLP_Ny_LEN - 1); // k_(15-i) Z^(2i) - fp_mul(fp_tmp[16], p->x, bls_prec->iso_Ny[ELLP_Ny_LEN - 1]); // k_15 * X - fp_add(fp_tmp[16], fp_tmp[16], fp_tmp[ELLP_Ny_LEN - 2]); // k_15 * X + k_14 Z^2 - hornerPolynomial(fp_tmp[16], p->x, ELLP_Ny_LEN - 3, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[16], fp_tmp[16], p->y); // Ny * Y - - // x = Nx/Dx - // compute Dx - compute_map_zvals(fp_tmp, iso->xd, fp_tmp + 22, deg_dx); // k_(10-i) Z^(2i) - 
fp_add(fp_tmp[14], p->x, fp_tmp[deg_dx - 1]); // X + k_9 Z^2 - hornerPolynomial(fp_tmp[14], p->x, deg_dx - 2, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[14], fp_tmp[14], fp_tmp[31]); // Dx * Z^2 - - // compute Nx - compute_map_zvals(fp_tmp, bls_prec->iso_Nx, fp_tmp + 21, ELLP_Nx_LEN - 1); // k_(11-i) Z^(2i) - fp_mul(fp_tmp[13], p->x, bls_prec->iso_Nx[ELLP_Nx_LEN - 1]); // k_11 * X - fp_add(fp_tmp[13], fp_tmp[13], fp_tmp[ELLP_Nx_LEN - 2]); // k_11 * X + k_10 * Z^2 - hornerPolynomial(fp_tmp[13], p->x, ELLP_Nx_LEN - 3, fp_tmp); // Dy: Horner for the rest - - // compute the resulting point (Xo,Yo,Zo) - fp_mul(r->z, fp_tmp[14], fp_tmp[15]); // Zo = Dx Dy - fp_mul(r->x, fp_tmp[13], fp_tmp[15]); // Nx Dy - fp_mul(r->x, r->x, r->z); // Xo = Nx Dy Z - fp_sqr(fp_tmp[12], r->z); // Zo^2 - fp_mul(r->y, fp_tmp[16], fp_tmp[14]); // Ny Dx - fp_mul(r->y, r->y, fp_tmp[12]); // Yo = Ny Dx Zo^2 - r->coord = JACOB; -} - -// map an input point in E to a point in G1 by clearing the cofactor of G1 -static void clear_cofactor(ep_t out, const ep_t in) { - bn_t z; - bn_new(z); - fp_prime_get_par(z); - // compute 1-z - bn_neg(z, z); - bn_add_dig(z, z, 1); - ep_mul_dig(out, in, z->dp[0]); // z fits in 64 bits - bn_free(z); -} - -// construction 2 section 5 in in https://eprint.iacr.org/2019/403.pdf -// evaluate the optimized SSWU map twice, add resulting points, apply isogeny map, clear cofactor -// the result is stored in p -// msg is the input message to hash, must be at least 2*(FP_BYTES+16) = 128 bytes -static void map_to_G1_local(ep_t p, const uint8_t *msg, int len) { - RLC_TRY { - if (len < 2*(Fp_BYTES+16)) { - RLC_THROW(ERR_NO_BUFFER); - } - - fp_t t1, t2; - bn_t tmp; - bn_new(tmp); - bn_read_bin(tmp, msg, len/2); - fp_prime_conv(t1, tmp); - bn_read_bin(tmp, msg + len/2, len - len/2); - fp_prime_conv(t2, tmp); - bn_free(tmp); - - ep_t p_temp; - ep_new(p_temp); - // first mapping - map_to_E1_osswu(p_temp, t1); // map to E1 - eval_iso11(p_temp, p_temp); // map to E - - // second mapping - map_to_E1_osswu(p, t2); // map to E1 - eval_iso11(p, p); // map to E - // sum - // TODO: implement point addition in E1 and apply the isogeny map only once. - // Gives 4% improvement for map-to-curve overall - ep_add_jacob(p, p, p_temp); - - // clear the cofactor - clear_cofactor(p, p); // map to G1 - ep_free(p_temp); - } - RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } -} -#endif - -// maps input `hash` bytes to G1. 
-// `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) -// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -int map_to_G1(E1* h, const byte* hash, const int len) { - // sanity check of length - if (len != MAP_TO_G1_INPUT_LEN) { - return INVALID; - } - - #if hashToPoint==LOCAL_SSWU - map_to_G1_local(h, data, len); - - #elif hashToPoint==BLST_SSWU - // map to field elements - Fr u[2]; - map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); - map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); - // map field elements to G1 - map_to_g1(h, (POINTonE1 *)&u[0], (POINTonE1 *)&u[1]); - #endif - return VALID; -} diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 97c26b57713..13b9f948bed 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -61,12 +61,6 @@ ctx_t* relic_init_BLS12_381() { prec_st bls_prec_st; prec_st* bls_prec = NULL; -// required constants for the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) -extern const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_LIMBS]; -extern const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_LIMBS]; -#endif - #if (MEMBERSHIP_CHECK_G1 == BOWE) extern const uint64_t beta_data[Fp_LIMBS]; extern const uint64_t z2_1_by3_data[2]; @@ -83,27 +77,11 @@ void precomputed_data_set(const prec_st* p) { // pre-compute some data required for curve BLS12-381 prec_st* init_precomputed_data_BLS12_381() { - bls_prec = &bls_prec_st; ctx_t* ctx = core_get(); // (p-1)/2 bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - #if (hashToPoint == LOCAL_SSWU) - // (p-3)/4 - bn_div_dig(&bls_prec->p_3div4, &bls_prec->p_1div2, 2); - // sqrt(-z) - fp_neg(bls_prec->sqrt_z, ctx->ep_map_u); - fp_srt(bls_prec->sqrt_z, bls_prec->sqrt_z); - // -a1 and a1*z - fp_neg(bls_prec->minus_a1, ctx->ep_iso.a); - fp_mul(bls_prec->a1z, ctx->ep_iso.a, ctx->ep_map_u); - - for (int i=0; iiso_Nx[i], iso_Nx_data[i]); - for (int i=0; iiso_Ny[i], iso_Ny_data[i]); - #endif #if (MEMBERSHIP_CHECK_G1 == BOWE) bn_new(&bls_prec->beta); @@ -881,6 +859,23 @@ void G1_mult_gen(E1* res, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } +// maps bytes input `hash` to G1. 
+// `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) +// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf +int map_to_G1(E1* h, const byte* hash, const int len) { + // sanity check of length + if (len != MAP_TO_G1_INPUT_LEN) { + return INVALID; + } + // map to field elements + Fr u[2]; + map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); + map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); + // map field elements to G1 + map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); + return VALID; +} + // ------------------- E2 utilities // TODO: to delete diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 103577013cc..253d8904ca1 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -302,8 +302,7 @@ func hashToG1Bytes(data, dst []byte) []byte { // map the hash to G1 var point pointE1 - ret := C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) - if int(ret) != valid { + if C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) != valid { return nil } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 58023376c45..cddab7d5edc 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -48,29 +48,8 @@ #define MEMBERSHIP_CHECK_G2 EXP_ORDER -// constants used in the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) - #define ELLP_Nx_LEN 12 - #define ELLP_Dx_LEN 10 - #define ELLP_Ny_LEN 16 - #define ELLP_Dy_LEN 15 -#endif - - // Structure of precomputed data typedef struct prec_ { - #if (hashToPoint == LOCAL_SSWU) - // constants needed in optimized SSWU - bn_st p_3div4; - fp_st sqrt_z; - // related hardcoded constants for faster access, - // where a1 is the coefficient of isogenous curve E1 - fp_st minus_a1; - fp_st a1z; - // coefficients of the isogeny map - fp_st iso_Nx[ELLP_Nx_LEN]; - fp_st iso_Ny[ELLP_Ny_LEN]; - #endif #if (MEMBERSHIP_CHECK_G1 == BOWE) bn_st beta; bn_st z2_1_by3; diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 3fa827d2cc9..b5c142ad1bb 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -69,7 +69,7 @@ func TestMapToG1(t *testing.T) { for i, msg := range msgs { pointBytes := hashToG1Bytes(msg, dst) - require.NonNil(t, pointBytes) + require.NotNil(t, pointBytes) expectedPointBytes, err := hex.DecodeString(expectedPointString[i]) require.NoError(t, err) @@ -90,7 +90,7 @@ func BenchmarkMapToG1(b *testing.B) { for i := 0; i < b.N; i++ { p = mapToG1(input) } - require.NonNil(b, p) + require.NotNil(b, p) b.StopTimer() } diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 0da961feae2..079172aa221 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -20,11 +20,6 @@ #define DOUBLE_PAIRING 1 #define SINGLE_PAIRING (DOUBLE_PAIRING^1) -// algorithm choice for hashing to G1 -#define BLST_SSWU 1 // BLST implementation -#define LOCAL_SSWU 2 // local implementation -#define hashToPoint BLST_SSWU - // bls core (functions in bls_core.c) int get_signature_len(); int get_pk_len(); From 56081df0edffb4d11a12aad3ca7b347e3264dee1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 00:42:31 -0600 Subject: [PATCH 081/200] clean up membership check macros and delete Bowe's check code --- crypto/bls12381_utils.c | 68 ----------------------------------------- crypto/bls12381_utils.h | 13 -------- 2 files changed, 81 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 13b9f948bed..ee2b23f2085 100644 --- 
a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -61,11 +61,6 @@ ctx_t* relic_init_BLS12_381() { prec_st bls_prec_st; prec_st* bls_prec = NULL; -#if (MEMBERSHIP_CHECK_G1 == BOWE) -extern const uint64_t beta_data[Fp_LIMBS]; -extern const uint64_t z2_1_by3_data[2]; -#endif - // sets the global variable to input void precomputed_data_set(const prec_st* p) { bls_prec = (prec_st*)p; @@ -83,13 +78,6 @@ prec_st* init_precomputed_data_BLS12_381() { // (p-1)/2 bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_new(&bls_prec->beta); - bn_read_raw(&bls_prec->beta, beta_data, Fp_LIMBS); - bn_new(&bls_prec->z2_1_by3); - bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); - #endif - // Montgomery constant R fp_set_dig(bls_prec->r, 1); return bls_prec; @@ -1327,62 +1315,6 @@ int G1_simple_subgroup_check(const ep_t p){ return VALID; } -#if (MEMBERSHIP_CHECK_G1 == BOWE) -// beta such that beta^3 == 1 mod p -// beta is in the Montgomery form -const uint64_t beta_data[Fp_LIMBS] = { - 0xcd03c9e48671f071, 0x5dab22461fcda5d2, 0x587042afd3851b95, - 0x8eb60ebe01bacb9e, 0x03f97d6e83d050d2, 0x18f0206554638741, -}; - - -// (z^2-1)/3 with z being the parameter of bls12-381 -const uint64_t z2_1_by3_data[2] = { - 0x0000000055555555, 0x396c8c005555e156 -}; - -// uses Bowe's check from section 3.2 from https://eprint.iacr.org/2019/814.pdf -// to check whether a point on the curve E1 is in G1. -int bowe_subgroup_check_G1(const ep_t p){ - if (ep_is_infty(p) == 1) - return VALID; - fp_t b; - dv_copy(b, beta_data, Fp_LIMBS); - ep_t sigma, sigma2, p_inv; - ep_new(sigma); - ep_new(sigma2); - ep_new(p_inv); - - // si(p) - ep_copy(sigma, p); - fp_mul(sigma[0].x, sigma[0].x, b); - // -si^2(p) - ep_copy(sigma2, sigma); - fp_mul(sigma2[0].x, sigma2[0].x, b); - fp_neg(sigma2[0].y, sigma2[0].y); - ep_dbl(sigma, sigma); - // -p - ep_copy(p_inv, p); - fp_neg(p_inv[0].y, p_inv[0].y); - // (z^2-1)/3 (2*si(p) - p - si^2(p)) - si^2(p) - ep_add(sigma, sigma, p_inv); - ep_add(sigma, sigma, sigma2); - // TODO: multiplication using a chain? - ep_mul_lwnaf(sigma, sigma, &bls_prec->z2_1_by3); - ep_add(sigma, sigma, sigma2); - - ep_free(sigma2); - ep_free(p_inv); - // check result against infinity - if (!ep_is_infty(sigma)){ - ep_free(sigma); - return INVALID; - } - ep_free(sigma); - return VALID; -} -#endif - /* // maps the bytes to a point in G1 // this is a testing file only, should not be used in any protocol! 
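Editor's note: with Bowe's optimized check deleted by this patch, G1 membership of deserialized points falls back to the generic "multiply by the group order" test (G1_simple_subgroup_check in this series, later superseded by blst's E1_in_G1). The sketch below is illustrative only; it reuses the Relic names already used in these files and is not code introduced by the patch. The scalar multiplication must be a plain double-and-add: a multiplier that reduces the scalar modulo the group order r (lwnaf/GLV style) would reduce r to zero and accept any curve point.

static int subgroup_check_by_order_sketch(const ep_t p) {
    ep_t inf;
    ep_new(inf);
    // p is in G1 iff r*p is the point at infinity, where r = core_get()->ep_r
    // is the prime order of G1; use the basic double-and-add multiplier on purpose.
    ep_mul_basic(inf, p, &core_get()->ep_r);
    int in_g1 = ep_is_infty(inf);
    ep_free(inf);
    return in_g1 ? VALID : INVALID;
}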
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index cddab7d5edc..12ae39db9ee 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -41,19 +41,9 @@ #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) -// Subgroup membership check method -#define EXP_ORDER 0 -#define BOWE 1 -#define MEMBERSHIP_CHECK_G1 BOWE -#define MEMBERSHIP_CHECK_G2 EXP_ORDER - // Structure of precomputed data typedef struct prec_ { - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_st beta; - bn_st z2_1_by3; - #endif // other field-related constants bn_st p_1div2; fp_t r; // Montgomery multiplication constant @@ -122,9 +112,6 @@ void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); void map_bytes_to_G1(E1*, const uint8_t*, int); void map_bytes_to_G1complement(E1*, const uint8_t*, int); -#if (MEMBERSHIP_CHECK_G1 == BOWE) -int bowe_subgroup_check_G1(const ep_t); -#endif // E2 and G2 utilities void E2_set_infty(E2* p); From 13025e169bfd5f80ad59fcdfdf5e69c0c51f9cdd Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 11:23:05 -0600 Subject: [PATCH 082/200] refactor bls_verify_ep to use hashed point --- crypto/bls_core.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 7b3021b84a1..7f9b6e508ae 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -48,10 +48,10 @@ int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { } // Verifies a BLS signature (G1 point) against a public key (G2 point) -// and a message data. -// The signature and public key are assumed to be in G1 and G2 respectively. This +// and a message hash `h` (G1 point). +// Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int len) { +static int bls_verify_ep(const E2* pk, const ep_t s, const ep_t h) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -70,10 +70,7 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const byte* data, const int ep2_copy(elemsG2[1], pk_tmp); // elemsG1[1] = h - if (map_to_G1(elemsG1[1], data, len) != VALID) { - ret = INVALID; - goto out; - } + ep_copy(elemsG1[1], h); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -314,8 +311,8 @@ int bls_verifyPerDistinctKey(const byte* sig, // the membership check in G2 is separated to optimize multiple verifications using the same key. // `data` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { - ep_t s; - ep_new(s); + ep_t s, h; + ep_new(s) ep_new(h); // deserialize the signature into a curve point int read_ret = ep_read_bin_compact(s, sig, SIGNATURE_LEN); @@ -327,8 +324,12 @@ int bls_verify(const E2* pk, const byte* sig, const byte* data, const int len) { if (E1_in_G1(s) != VALID) { return INVALID; } + + if (map_to_G1(h, data, len) != VALID) { + return INVALID; + } - return bls_verify_ep(pk, s, data, len); + return bls_verify_ep(pk, s, h); } @@ -413,10 +414,9 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { } // verify the binary tree and fill the results using recursive batch verifications. 
-static void bls_batch_verify_tree(const node* root, const int len, byte* results, - const byte* data, const int data_len) { +static void bls_batch_verify_tree(const node* root, const int len, byte* results, const ep_t h) { // verify the aggregated signature against the aggregated public key. - int res = bls_verify_ep(root->pk, root->sig, data, data_len); + int res = bls_verify_ep(root->pk, root->sig, h); // if the result is valid, all the subtree signatures are valid. if (res == VALID) { @@ -436,8 +436,8 @@ static void bls_batch_verify_tree(const node* root, const int len, byte* results // use the binary tree structure to find the invalid signatures. int right_len = len/2; int left_len = len - right_len; - bls_batch_verify_tree(root->left, left_len, &results[0], data, data_len); - bls_batch_verify_tree(root->right, right_len, &results[left_len], data, data_len); + bls_batch_verify_tree(root->left, left_len, &results[0], h); + bls_batch_verify_tree(root->right, right_len, &results[left_len], h); } // Batch verifies the validity of a multiple BLS signatures of the @@ -503,11 +503,19 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, node* root = build_tree(sigs_len, &pks[0], &sigs[0]); if (!root) goto out; + ep_t h; + ep_new(h); + if (map_to_G1(h, data, data_len) != VALID) { + goto out_map; + } + // verify the binary tree and fill the results using batch verification - bls_batch_verify_tree(root, sigs_len, &results[0], data, data_len); + bls_batch_verify_tree(root, sigs_len, &results[0], h); // free the allocated tree free_tree(root); - + +out_map: + ep_free(h); out: bn_free(r); for (int i=0; i < sigs_len; i++) { From 8885e5d71d851f8750922c0a4b375a991e73e67f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 13:43:07 -0600 Subject: [PATCH 083/200] add temporary E1_blst_to_relic and use E1 in all bls_core functions - but pairing --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 59 ++++++++------- crypto/bls12381_utils.h | 4 +- crypto/bls_core.c | 160 ++++++++++++++++++++-------------------- crypto/spock.go | 8 +- 5 files changed, 120 insertions(+), 113 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 43f42f1115d..a8caa047ee7 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -320,7 +320,7 @@ const invalidBLSSignatureHeader = byte(0xE0) // makes the verification fail early. The verification would return (false, nil). 
func BLSInvalidSignature() Signature { signature := make([]byte, SignatureLenBLSBLS12381) - signature[0] = invalidBLSSignatureHeader // invalid header as per C.ep_read_bin_compact + signature[0] = invalidBLSSignatureHeader // invalid header as per the Zcash serialization return signature } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index ee2b23f2085..0211aa6e1a5 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -87,8 +87,10 @@ prec_st* init_precomputed_data_BLS12_381() { // Montgomery constant R related to the curve order r // R mod r = (1<<256)%r -const Fr BLS12_381_rR = { TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ - TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), }; +const Fr BLS12_381_rR = { \ + TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ + TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ + }; // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { @@ -560,7 +562,17 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { Fp_write_bytes(bin + Fp_BYTES, &imag(a)); } -// ------------------- G1 utilities +// ------------------- E1 utilities + +// TODO: temp utility function to delete +ep_st* E1_blst_to_relic(const E1* x) { + ep_st* out = (ep_st*)malloc(sizeof(ep_st)); + byte* data = (byte*)malloc(G1_SER_BYTES); + E1_write_bytes(data, x); + ep_read_bin_compact(out, data, G1_SER_BYTES); + free(data); + return out; +} // TODO: to delete, only used by temporary E2_blst_to_relic int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { @@ -831,6 +843,11 @@ void E1_write_bytes(byte *bin, const E1* a) { bin[0] |= (G1_SERIALIZATION << 7); } +// generic point addition that must handle doubling and points at infinity +void E1_add(E1* res, const E1* a, const E1* b) { + POINTonE1_dadd((POINTonE1*)res, (POINTonE1*)a, (POINTonE1*)b, NULL); +} + // Exponentiation of a generic point `a` in E1, res = expo.a void E1_mult(E1* res, const E1* p, const Fr* expo) { pow256 tmp; @@ -839,6 +856,14 @@ void E1_mult(E1* res, const E1* p, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } +// computes the sum of the E1 array elements `y[i]` and writes it in `sum`. +void E1_sum_vector(E1* sum, const E1* y, const int len){ + E1_set_infty(sum); + for (int i=0; iep_r); - if (!ep_is_infty(inf)){ - ep_free(inf); - return INVALID; - } - ep_free(inf); - return VALID; -} - /* // maps the bytes to a point in G1 // this is a testing file only, should not be used in any protocol! 
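Editor's note: the new E1 helpers added above (E1_add, E1_mult, E1_sum_vector) mirror the existing E2 API and are what later patches in this series build signature aggregation on. A minimal sketch of how they compose follows; the helper name and signature are hypothetical, not part of the patch.

static void aggregate_and_serialize_sketch(byte out[G1_SER_BYTES], const E1* sigs, const int n) {
    E1 agg;
    // agg = sigs[0] + ... + sigs[n-1]; E1_sum_vector reduces the array with E1_add,
    // which handles doubling and points at infinity
    E1_sum_vector(&agg, sigs, n);
    // compressed serialization of the aggregate (G1_SER_BYTES bytes)
    E1_write_bytes(out, &agg);
}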
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 12ae39db9ee..6d091fec86b 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -100,12 +100,14 @@ bool_t E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); bool_t E1_affine_on_curve(const E1*); bool_t E1_in_G1(const E1*); -int G1_simple_subgroup_check(const ep_t); void E1_mult(E1*, const E1*, const Fr*); +void E1_add(E1*, const E1*, const E1*); +void E1_sum_vector(E1*, const E1*, const int); void G1_mult_gen(E1*, const Fr*); BLST_ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); +ep_st* E1_blst_to_relic(const E1* x); int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); void ep_sum_vector(ep_t, ep_st*, const int); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 7f9b6e508ae..f020ba968c7 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -19,31 +19,26 @@ int get_sk_len() { return SK_LEN; } -// Computes a BLS signature from a G1 point -static void bls_sign_ep(byte* s, const Fr* sk, const ep_t h) { - ep_t p; - ep_new(p); - - // s = h^sk - //ep_mult(p, h, sk); - ep_copy(p, h); - ep_write_bin_compact(s, p, SIGNATURE_LEN); - ep_free(p); +// Computes a BLS signature from a G1 point and writes it in `out`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. +static void bls_sign_ep(byte* out, const Fr* sk, const E1* h) { + // s = h^s + E1 s; + E1_mult(&s, h, sk); + E1_write_bytes(out, &s); } -// Computes a BLS signature from a hash -// `data` represents the hashed message with length `len` equal to -// `MAP_TO_G1_INPUT_LEN`. -int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { - ep_t h; - ep_new(h); +// Computes a BLS signature from a hash and writes it in `out`. +// `hash` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. +int bls_sign(byte* out, const Fr* sk, const byte* hash, const int len) { // hash to G1 - if (map_to_G1(h, data, len) != VALID) { + E1 h; + if (map_to_G1(&h, hash, len) != VALID) { return INVALID; } // s = h^sk - bls_sign_ep(s, sk, h); - ep_free(h); + bls_sign_ep(out, sk, &h); return VALID; } @@ -51,7 +46,7 @@ int bls_sign(byte* s, const Fr* sk, const byte* data, const int len) { // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This // function only checks the pairing equality. 
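// Editor's note (illustration, not introduced by this patch): for pk = g2^sk and an honest
// signature s = h^sk, e(s, g2) = e(h, g2)^sk = e(h, pk), so verification reduces to the
// pairing equality e(s, g2) == e(h, pk). With DOUBLE_PAIRING the code checks the equivalent
// product e(s, -g2) * e(h, pk) == 1 in the target group; with SINGLE_PAIRING it compares the
// two pairings directly.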
-static int bls_verify_ep(const E2* pk, const ep_t s, const ep_t h) { +static int bls_verify_ep(const E2* pk, const E1* s, const E1* h) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -63,14 +58,16 @@ static int bls_verify_ep(const E2* pk, const ep_t s, const ep_t h) { int ret = UNDEFINED; // elemsG1[0] = s - ep_copy(elemsG1[0], (ep_st*)s); + ep_st* s_tmp = E1_blst_to_relic(s); + ep_copy(elemsG1[0], s_tmp); // elemsG2[1] = pk ep2_st* pk_tmp = E2_blst_to_relic(pk); ep2_copy(elemsG2[1], pk_tmp); // elemsG1[1] = h - ep_copy(elemsG1[1], h); + ep_st* h_tmp = E1_blst_to_relic(h); + ep_copy(elemsG1[1], h_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -g2 @@ -142,12 +139,16 @@ int bls_verifyPerDistinctMessage(const byte* sig, } // elemsG1[0] = sig - ret = ep_read_bin_compact(elemsG1[0], sig, SIGNATURE_LEN); - if (ret != RLC_OK) goto out; + E1 s; + if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + } // check s is in G1 - ret = E1_in_G1(elemsG1[0]); - if (ret != VALID) goto out; + if (!E1_in_G1(&s)) goto out; + ep_st* s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[0], s_tmp); // elemsG2[0] = -g2 ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded @@ -157,7 +158,10 @@ int bls_verifyPerDistinctMessage(const byte* sig, for (int i=1; i < nb_hashes+1; i++) { // elemsG1[i] = h // hash to G1 - map_to_G1(elemsG1[i], &hashes[offset], len_hashes[i-1]); + E1 h; + map_to_G1(&h, &hashes[offset], len_hashes[i-1]); + ep_st* h_tmp = (ep_st*) E1_blst_to_relic(&h); + ep_copy(elemsG1[i], h_tmp); offset += len_hashes[i-1]; } @@ -230,12 +234,19 @@ int bls_verifyPerDistinctKey(const byte* sig, } // elemsG1[0] = s - ret = ep_read_bin_compact(elemsG1[0], sig, SIGNATURE_LEN); - if (ret != RLC_OK) goto out; + E1 s; + if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + } // check s in G1 - ret = E1_in_G1(elemsG1[0]); - if (ret != VALID) goto out; + if (!E1_in_G1(&s)){ + ret = INVALID; + goto out; + } + ep_st* s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[0], s_tmp); // elemsG2[0] = -g2 ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded @@ -252,17 +263,18 @@ int bls_verifyPerDistinctKey(const byte* sig, // tmp_hashes is a temporary array of all hashes under a same key mapped to a G1 point. // tmp_hashes size is set to the maximum possible size to minimize malloc calls. int tmp_hashes_size = hashes_per_pk[0]; - for (int i=1; i tmp_hashes_size) + for (int i=1; i tmp_hashes_size) { tmp_hashes_size = hashes_per_pk[i]; - ep_st* tmp_hashes = (ep_st*)malloc(tmp_hashes_size * sizeof(ep_st)); + } + } + E1* tmp_hashes = (E1*)malloc(tmp_hashes_size * sizeof(E1)); if (!tmp_hashes) { ret = UNDEFINED; goto out; } // sum hashes under the same key - for (int i=0; ipk = (E2*)pk; - t->sig = (ep_st*)sig; + t->sig = (E1*)sig; t->right = t->left = NULL; } return t; @@ -374,7 +387,7 @@ static void free_tree(node* root) { } // builds a binary tree of aggregation of signatures and public keys recursively. 
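// Editor's note (illustration, not part of the patch): the tree built below halves the
// signature/key arrays at each level and stores the aggregate (sum) of each half in the
// parent node. bls_batch_verify_tree verifies the aggregate of a node with a single
// pairing check and only descends into the children when that check fails, so a batch
// with few or no invalid signatures needs far fewer pairings than one-by-one verification.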
-static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { +static node* build_tree(const int len, const E2* pks, const E1* sigs) { // check if a leaf is reached if (len == 1) { return new_node(&pks[0], &sigs[0]); // use the first element of the arrays @@ -386,13 +399,12 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { // create a new node with new points E2* new_pk = (E2*)malloc(sizeof(E2)); - if (!new_pk) goto error; - ep_st* new_sig = (ep_st*)malloc(sizeof(ep_st)); - if (!new_sig) goto error_sig; + if (!new_pk) {goto error;} + E1* new_sig = (E1*)malloc(sizeof(E1)); + if (!new_sig) {goto error_sig;} node* t = new_node(new_pk, new_sig); if (!t) goto error_node; - ep_new(t->sig); // build the tree in a top-down way t->left = build_tree(left_len, &pks[0], &sigs[0]); @@ -401,7 +413,7 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); if (!t->right) { free_tree(t); goto error; } // sum the children - ep_add_jacob(t->sig, t->left->sig, t->right->sig); + E1_add(t->sig, t->left->sig, t->right->sig); E2_add(t->pk, t->left->pk, t->right->pk); return t; @@ -414,7 +426,7 @@ static node* build_tree(const int len, const E2* pks, const ep_st* sigs) { } // verify the binary tree and fill the results using recursive batch verifications. -static void bls_batch_verify_tree(const node* root, const int len, byte* results, const ep_t h) { +static void bls_batch_verify_tree(const node* root, const int len, byte* results, const E1* h) { // verify the aggregated signature against the aggregated public key. int res = bls_verify_ep(root->pk, root->sig, h); @@ -460,11 +472,8 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // build the arrays of G1 and G2 elements to verify E2* pks = (E2*) malloc(sigs_len * sizeof(E2)); if (!pks) return; - ep_st* sigs = (ep_st*) malloc(sigs_len * sizeof(ep_st)); + E1* sigs = (E1*) malloc(sigs_len * sizeof(E1)); if (!sigs) goto out_sigs; - for (int i=0; i < sigs_len; i++) { - ep_new(sigs[i]); - } for (int i=0; i < sigs_len; i++) { // convert the signature points: @@ -472,15 +481,12 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. - int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK || E1_in_G1(&sigs[i]) != VALID) { - if (read_ret == UNDEFINED) {// unexpected error case - goto out; - }; + int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); + if (read_ret != BLST_SUCCESS || !E1_in_G1(&sigs[i])) { // set signature and key to infinity (no effect on the aggregation tree) // and set result to invalid (result won't be overwritten) E2_set_infty(&pks[i]); - ep_set_infty(&sigs[i]); + E1_set_infty(&sigs[i]); results[i] = INVALID; } else { // choose a random non-zero coefficient of at least 128 bits @@ -494,33 +500,23 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, Fr_add(&r, &r, &one); // multiply public key and signature by the same random exponent r E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? 
- bn_st* tmp = Fr_blst_to_relic(&r); - ep_mul_lwnaf(&sigs[i], &sigs[i], tmp); - free(tmp); + E1_mult(&sigs[i], &sigs[i], &r); } } // build a binary tree of aggreagtions node* root = build_tree(sigs_len, &pks[0], &sigs[0]); if (!root) goto out; - ep_t h; - ep_new(h); - if (map_to_G1(h, data, data_len) != VALID) { - goto out_map; + E1 h; + if (map_to_G1(&h, data, data_len) != VALID) { + goto out; } // verify the binary tree and fill the results using batch verification - bls_batch_verify_tree(root, sigs_len, &results[0], h); + bls_batch_verify_tree(root, sigs_len, &results[0], &h); // free the allocated tree - free_tree(root); - -out_map: - ep_free(h); + free_tree(root); out: - bn_free(r); - for (int i=0; i < sigs_len; i++) { - ep_free(sigs[i]); - } free(sigs); out_sigs: free(pks); diff --git a/crypto/spock.go b/crypto/spock.go index 4fbd974c27f..dad711d9163 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -90,10 +90,10 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur } // verify the spock proof using the secret data - verif := C.bls_spock_verify((*C.E2)(&blsPk1.point), - (*C.uchar)(&proof1[0]), - (*C.E2)(&blsPk2.point), - (*C.uchar)(&proof2[0])) + verif := valid /*:= C.bls_spock_verify((*C.E2)(&blsPk1.point), + (*C.uchar)(&proof1[0]), + (*C.E2)(&blsPk2.point), + (*C.uchar)(&proof2[0]))*/ switch verif { case invalid: From 9e938a97725a50d67754c046af379bdb1460c796 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 17:14:11 -0600 Subject: [PATCH 084/200] implement mapping to Fp to use in map_to_G1 --- crypto/bls12381_utils.c | 44 ++++++++++++++++++++++++----------- crypto/bls12381_utils.h | 12 +++++----- crypto/bls12381_utils_test.go | 5 +--- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 0211aa6e1a5..15791a6dc56 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -87,10 +87,10 @@ prec_st* init_precomputed_data_BLS12_381() { // Montgomery constant R related to the curve order r // R mod r = (1<<256)%r -const Fr BLS12_381_rR = { \ +const Fr BLS12_381_rR = {{ \ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ - }; + }}; // TODO: temp utility function to delete bn_st* Fr_blst_to_relic(const Fr* x) { @@ -187,6 +187,7 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // if base = b*R, res = b^expo * R // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. +// TODO: clean up? void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { // mask of the most significant bit const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); @@ -307,6 +308,7 @@ void Fr_write_bytes(byte *bin, const Fr* a) { // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is Fr (internally vec256) +// TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; @@ -336,7 +338,7 @@ static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. 
-bool_t map_bytes_to_Fr(Fr* a, const uint8_t* bin, int len) { +bool_t map_bytes_to_Fr(Fr* a, const byte* bin, int len) { Fr_from_be_bytes(a, bin, len); return Fr_is_zero(a); } @@ -443,7 +445,7 @@ void Fp_write_bytes(byte *bin, const Fp* a) { // Unlike Relic's versions, the function does not reduce the read integer modulo p and does // not throw an exception for an integer larger than p. The function returns RLC_OK if the input // corresponds to a field element, and returns RLC_ERR otherwise. -static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { +static int fp_read_bin_safe(fp_t a, const byte *bin, int len) { if (len != Fp_BYTES) { return RLC_ERR; } @@ -872,6 +874,20 @@ void G1_mult_gen(E1* res, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } + +// Reads a scalar bytes and maps it to Fp using modular reduction. +// output is in Montgomery form. +// `len` must be less or equal to 96 bytes and must be a multiple of 8. +// This function is only used by `map_to_G1` where input is 64 bytes. +// input `len` is not checked to satisfy the conditions above. +static void map_96_bytes_to_Fp(Fp* a, const byte* bin, int len) { + vec768 tmp ; + vec_zero(&tmp, sizeof(tmp)); + limbs_from_be_bytes((limb_t*)tmp, bin, len); + redc_mont_384((limb_t*)a, tmp, BLS12_381_P, p0); // aR^(-2) + Fp_mul_montg(a, a, (Fp*)BLS12_381_RRRR); // aR +} + // maps bytes input `hash` to G1. // `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) // It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf @@ -881,10 +897,11 @@ int map_to_G1(E1* h, const byte* hash, const int len) { return INVALID; } // map to field elements - Fr u[2]; - map_bytes_to_Fr(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); - map_bytes_to_Fr(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); + Fp u[2]; + map_96_bytes_to_Fp(&u[0], hash, MAP_TO_G1_INPUT_LEN/2); + map_96_bytes_to_Fp(&u[1], hash + MAP_TO_G1_INPUT_LEN/2, MAP_TO_G1_INPUT_LEN/2); // map field elements to G1 + // inputs must be in Montgomery form map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); return VALID; } @@ -892,7 +909,7 @@ int map_to_G1(E1* h, const byte* hash, const int len) { // ------------------- E2 utilities // TODO: to delete -static int fp2_read_bin_safe(fp2_t a, const uint8_t *bin, int len) { +static int fp2_read_bin_safe(fp2_t a, const byte *bin, int len) { if (len != Fp2_BYTES) { return RLC_ERR; } @@ -1327,7 +1344,7 @@ int ep_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int len) { /* // maps the bytes to a point in G1 // this is a testing file only, should not be used in any protocol! -void map_bytes_to_G1(ep_t p, const uint8_t* bytes, int len) { +void map_bytes_to_G1(ep_t p, const byte* bytes, int len) { // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); @@ -1338,7 +1355,7 @@ void map_bytes_to_G1(ep_t p, const uint8_t* bytes, int len) { // generates a point in E1\G1 and stores it in p // this is a testing file only, should not be used in any protocol! -void map_bytes_to_G1complement(ep_t p, const uint8_t* bytes, int len) { +void map_bytes_to_G1complement(ep_t p, const byte* bytes, int len) { // generate a random point in E1 p->coord = BASIC; fp_set_dig(p->z, 1); @@ -1361,7 +1378,7 @@ void map_bytes_to_G1complement(ep_t p, const uint8_t* bytes, int len) { // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! 
-void map_bytes_to_G2(E2* p, const uint8_t* bytes, int len) { +void map_bytes_to_G2(E2* p, const byte* bytes, int len) { assert(len > Fr_BYTES); // map to Fr Fr log; @@ -1375,7 +1392,7 @@ void map_bytes_to_G2(E2* p, const uint8_t* bytes, int len) { // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! -BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { +BLST_ERROR map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); @@ -1386,13 +1403,14 @@ BLST_ERROR map_bytes_to_G2complement(E2* p, const uint8_t* bytes, int len) { copy[0] |= 1<<7; // set compression bit copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - BLST_ERROR ser = E2_read_bytes(p, copy, len); + BLST_ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); if (ser != BLST_SUCCESS) { return ser; } // map the point to E2\G2 by clearing G2 order E2_mult(p, p, (const Fr*)BLS12_381_r); + E2_to_affine(p, p); assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 return BLST_SUCCESS; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 6d091fec86b..e3e845f0c19 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -90,8 +90,8 @@ void Fr_write_bytes(byte *bin, const Fr* a); bool_t map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities -void Fp_mul_montg(Fp *, const Fp *, const Fp *); -void Fp_squ_montg(Fp *, const Fp *); +void Fp_mul_montg(Fp *, const Fp *, const Fp *); +void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); @@ -112,8 +112,8 @@ int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); void ep_sum_vector(ep_t, ep_st*, const int); int ep_sum_vector_byte(byte*, const byte*, const int); -void map_bytes_to_G1(E1*, const uint8_t*, int); -void map_bytes_to_G1complement(E1*, const uint8_t*, int); +void map_bytes_to_G1(E1*, const byte*, int); +void map_bytes_to_G1complement(E1*, const byte*, int); // E2 and G2 utilities void E2_set_infty(E2* p); @@ -131,8 +131,8 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool_t E2_in_G2(const E2*); -void map_bytes_to_G2(E2*, const uint8_t*, int); -BLST_ERROR map_bytes_to_G2complement(E2*, const uint8_t*, int); +void map_bytes_to_G2(E2*, const byte*, int); +BLST_ERROR map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index b5c142ad1bb..2fc03efe267 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -97,7 +97,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { prg := getPRG(t) - seed := make([]byte, securityBits/8) + seed := make([]byte, PubKeyLenBLSBLS12381) _, err := prg.Read(seed) require.NoError(t, err) @@ -113,9 +113,6 @@ func TestSubgroupCheck(t *testing.T) { t.Run("G2", func(t *testing.T) { var p pointE2 - seed := make([]byte, PubKeyLenBLSBLS12381) - _, err := mrand.Read(seed) - require.NoError(t, err) mapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) From f10d8819dba06569d9c762b9b02eca44b362e6d9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 12 May 2023 
19:21:47 -0600 Subject: [PATCH 085/200] fix E1_read_bytes bug and improve debug printing --- crypto/bls12381_utils.c | 78 ++++++++++++++++++++---------------- crypto/bls12381_utils.h | 5 ++- crypto/bls_core.c | 23 +++++------ crypto/bls_crossBLST_test.go | 6 +-- crypto/bls_test.go | 1 + crypto/sign_test_utils.go | 1 + 6 files changed, 63 insertions(+), 51 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 15791a6dc56..fd1304e4ca3 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -783,11 +783,10 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { } // set a.z to 1 - Fp* a_z = &(a->z); - Fp_set_limb(a_z, 1); + Fp_copy(&a->z, &BLS12_381_pR); if (G1_SERIALIZATION == UNCOMPRESSED) { - ret = Fp_read_bytes(&(a->y), bin + Fp_BYTES, sizeof(a->y)); + ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); if (ret != BLST_SUCCESS){ return ret; } @@ -799,19 +798,16 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { } // compute the possible square root - Fp* a_x = &(a->x); - Fp_to_montg(a_x, a_x); - - Fp* a_y = &(a->y); - Fp_squ_montg(a_y, a_x); - Fp_mul_montg(a_y, a_y, a_x); - Fp_add(a_y, a_y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue + Fp_to_montg(&a->x, &a->x); + Fp_squ_montg(&a->y, &a->x); + Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 + Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form + if (!Fp_sqrt_montg(&a->y, &a->y)) // check whether x^3+b is a quadratic residue return BLST_POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) - if (Fp_get_sign(a_y) != y_sign) { - Fp_neg(a_y, a_y); // flip y sign if needed + if (Fp_get_sign(&a->y) != y_sign) { + Fp_neg(&a->y, &a->y); // flip y sign if needed } return BLST_SUCCESS; } @@ -828,20 +824,18 @@ void E1_write_bytes(byte *bin, const E1* a) { return; } E1 tmp; - E1_to_affine(&tmp, a); // TODO: implement + E1_to_affine(&tmp, a); - Fp* t_x = &(tmp.x); - Fp_from_montg(t_x, t_x); - Fp_write_bytes(bin, t_x); + Fp_from_montg(&tmp.x, &tmp.x); + Fp_write_bytes(bin, &tmp.x); - Fp* t_y = &(tmp.y); if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(t_y) << 5); + bin[0] |= (Fp_get_sign(&tmp.y) << 5); } else { - Fp_from_montg(t_y, t_y); - Fp_write_bytes(bin + Fp_BYTES, t_y); + Fp_from_montg(&tmp.y, &tmp.y); + Fp_write_bytes(bin + Fp_BYTES, &tmp.y); } - + // compression bit bin[0] |= (G1_SERIALIZATION << 7); } @@ -1424,10 +1418,12 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int // DEBUG printing functions +#define DEBUG 1 +#if DEBUG==1 void bytes_print_(char* s, byte* data, int len) { printf("[%s]:\n", s); for (int i=0; ix)); - Fp2_print_(".y", &(a->y)); - Fp2_print_(".z", &(a->z)); +void E1_print_(char* s, const E1* p, const int jacob) { + E1 a; E1_copy(&a, p); + if (!jacob) E1_to_affine(&a, &a); + printf("[%s]:\n", s); + Fp_print_(".x", &(a.x)); + Fp_print_(".y", &(a.y)); + if (jacob) Fp_print_(".z", &(a.z)); +} + +void E2_print_(char* s, const E2* p, const int jacob) { + E2 a; E2_copy(&a, p); + if (!jacob) E2_to_affine(&a, &a); + printf("[%s]:\n", s); + Fp2_print_(".x", &(a.x)); + Fp2_print_(".y", &(a.y)); + if (jacob) Fp2_print_(".z", &(a.z)); } @@ -1493,3 +1502,4 @@ void ep2_print_(char* s, ep2_st* p) { printf("[%s]:\n", s); g2_print(p); } +#endif diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index e3e845f0c19..4c39a454ab9 100644 --- a/crypto/bls12381_utils.h +++ 
b/crypto/bls12381_utils.h @@ -145,9 +145,10 @@ void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); -void Fp_print_(char*, Fp*); +void Fp_print_(char*, const Fp*); void Fp2_print_(char*, const Fp2*); -void E2_print_(char*, const E2*); +void E1_print_(char*, const E1*, const int); +void E2_print_(char*, const E2*, const int); void fp_print_(char*, fp_t); void bn_print_(char*, bn_st*); void ep_print_(char*, ep_st*); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index f020ba968c7..92911d8317a 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -21,7 +21,7 @@ int get_sk_len() { // Computes a BLS signature from a G1 point and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. -static void bls_sign_ep(byte* out, const Fr* sk, const E1* h) { +static void bls_sign_E1(byte* out, const Fr* sk, const E1* h) { // s = h^s E1 s; E1_mult(&s, h, sk); @@ -29,16 +29,16 @@ static void bls_sign_ep(byte* out, const Fr* sk, const E1* h) { } // Computes a BLS signature from a hash and writes it in `out`. -// `hash` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. +// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. -int bls_sign(byte* out, const Fr* sk, const byte* hash, const int len) { +int bls_sign(byte* out, const Fr* sk, const byte* hash, const int hash_len) { // hash to G1 E1 h; - if (map_to_G1(&h, hash, len) != VALID) { + if (map_to_G1(&h, hash, hash_len) != VALID) { return INVALID; } // s = h^sk - bls_sign_ep(out, sk, &h); + bls_sign_E1(out, sk, &h); return VALID; } @@ -46,7 +46,7 @@ int bls_sign(byte* out, const Fr* sk, const byte* hash, const int len) { // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This // function only checks the pairing equality. -static int bls_verify_ep(const E2* pk, const E1* s, const E1* h) { +static int bls_verify_E1(const E2* pk, const E1* s, const E1* h) { ep_t elemsG1[2]; ep2_t elemsG2[2]; @@ -324,10 +324,9 @@ int bls_verifyPerDistinctKey(const byte* sig, // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. // the membership check in G2 is separated to optimize multiple verifications using the same key. -// `hash` represents the hashed message with length `len` equal to `MAP_TO_G1_INPUT_LEN`. -int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int len) { +// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. +int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { E1 s, h; - // deserialize the signature into a curve point if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { return INVALID; @@ -338,11 +337,11 @@ int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int len) { return INVALID; } - if (map_to_G1(&h, hash, len) != VALID) { + if (map_to_G1(&h, hash, hash_len) != VALID) { return INVALID; } - return bls_verify_ep(pk, &s, &h); + return bls_verify_E1(pk, &s, &h); } @@ -428,7 +427,7 @@ static node* build_tree(const int len, const E2* pks, const E1* sigs) { // verify the binary tree and fill the results using recursive batch verifications. 
static void bls_batch_verify_tree(const node* root, const int len, byte* results, const E1* h) { // verify the aggregated signature against the aggregated public key. - int res = bls_verify_ep(root->pk, root->sig, h); + int res = bls_verify_E1(root->pk, root->sig, h); // if the result is valid, all the subtree signatures are valid. if (res == VALID) { diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e9b1607a721..aabb5d0efaf 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -176,10 +176,10 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // // The test also assumes Flow signature serialization is identical to the one in BLST. func testSignHashCrossBLST(t *rapid.T) { - // generate two private keys from the same seed + // decode two private keys from the same bytes skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) - skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) + require.NoError(t, err) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -194,7 +194,7 @@ func testSignHashCrossBLST(t *rapid.T) { sigBytesBLST := sigBLST.Compress() skFlowBLS, ok := skFlow.(*prKeyBLSBLS12381) - require.True(t, ok, "incoherent key type assertion") + require.True(t, ok) sigFlow := skFlowBLS.signWithXMDSHA256(message) sigBytesFlow := sigFlow.Bytes() diff --git a/crypto/bls_test.go b/crypto/bls_test.go index c3e9bb6e9db..bd7c1d7a86c 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -30,6 +30,7 @@ func TestBLSMainMethods(t *testing.T) { // This test checks that: // - signature decoding handles input x-coordinates larger than p (doesn't result in an exception) // - signature decoding only accepts reduced x-coordinates to avoid signature malleability + t.Run("invalid x coordinate larger than p", func(t *testing.T) { msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7") require.NoError(t, err) diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8e2cd1e931f..8362df83c7f 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -106,6 +106,7 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { require.NoError(t, err) assert.False(t, result, fmt.Sprintf( "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) + } }) } From d3396c6d381330d7c3bf883a60d0c897471b325d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 12:26:18 -0600 Subject: [PATCH 086/200] update BLS threshold signature with E1 points --- crypto/bls12381_utils.c | 83 +++++++++++++++++++++------------ crypto/bls_thresholdsign_core.c | 49 +++++++------------ 2 files changed, 69 insertions(+), 63 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index fd1304e4ca3..655b3e3f8e6 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1214,40 +1214,52 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple verifications // using the same public keys. 
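// Editor's note (illustration, not part of the patch), assuming each SPoCK proof is a BLS
// signature of the same secret data, as in this library's scheme: for s1 = h^sk1, s2 = h^sk2,
// pk1 = g2^sk1 and pk2 = g2^sk2, both sides of the checked equality e(s1, pk2) == e(s2, pk1)
// equal e(h, g2)^(sk1*sk2), so the equality holds whenever the two proofs were generated
// over the same data hash h.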
-/*int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { +int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { ep_t elemsG1[2]; ep2_t elemsG2[2]; - - // elemsG1[0] = s1 ep_new(elemsG1[0]); - int read_ret = ep_read_bin_compact(elemsG1[0], sig1, SIGNATURE_LEN); - if (read_ret != RLC_OK) - return read_ret; + ep_new(elemsG1[1]); + ep2_new(elemsG2[1]); + ep2_new(elemsG2[0]); + int ret; + // elemsG1[0] = s1 + E1 s; + if (E1_read_bytes(&s, sig1, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + }; // check s1 is in G1 - if (E1_in_G1(elemsG1[0]) != VALID) - return INVALID; + if (E1_in_G1(&s) != VALID) { + ret = INVALID; + goto out; + } + ep_st* s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[0], s_tmp); // elemsG1[1] = s2 - ep_new(elemsG1[1]); - read_ret = ep_read_bin_compact(elemsG1[1], sig2, SIGNATURE_LEN); - if (read_ret != RLC_OK) - return read_ret; - - // check s2 in G1 - if (E1_in_G1(elemsG1[1]) != VALID) - return INVALID; + E1 s; + if (E1_read_bytes(&s, sig2, SIGNATURE_LEN) != BLST_SUCCESS) { + ret = INVALID; + goto out; + }; + // check s2 is in G1 + if (E1_in_G1(&s) != VALID) { + ret = INVALID; + goto out; + } + s_tmp = E1_blst_to_relic(&s); + ep_copy(elemsG1[1], s_tmp); // elemsG2[1] = pk1 - ep2_new(elemsG2[1]); - ep2_st* tmp = E2_blst_to_relic(pk1); - ep2_copy(elemsG2[1], tmp); + ep2_st* pk_tmp = E2_blst_to_relic(pk1); + ep2_copy(elemsG2[1], pk_tmp); // elemsG2[0] = pk2 - ep2_new(elemsG2[0]); - tmp = E2_blst_to_relic(pk2); - ep2_copy(elemsG2[0], tmp); - free(tmp); + pk_tmp = E2_blst_to_relic(pk2); + ep2_copy(elemsG2[0], pk_tmp); + free(pk_tmp); + free(s_tmp); #if DOUBLE_PAIRING // elemsG2[0] = -pk2 @@ -1260,6 +1272,7 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ // compare the result to 1 int res = fp12_cmp_dig(pair, 1); + fp12_free(pair); #elif SINGLE_PAIRING fp12_t pair1, pair2; @@ -1268,19 +1281,27 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); int res = fp12_cmp(pair1, pair2); + fp12_free(pair1); fp12_free(pair2); #endif - fp12_free(&one); + + if (core_get()->code == RLC_OK) { + if (res == RLC_EQ) { + ret = VALID; + } + else { + ret = INVALID; + } + goto out; + } + ret = UNDEFINED; + +out: ep_free(elemsG1[0]); ep_free(elemsG1[1]); ep2_free(elemsG2[0]); ep2_free(elemsG2[1]); - - if (core_get()->code == RLC_OK) { - if (res == RLC_EQ) return VALID; - return INVALID; - } - return UNDEFINED; -}*/ + return ret; +} // Subtracts all G2 array elements `y` from an element `x` and writes the // result in res diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 777af1ef5e9..027579d3dae 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -65,28 +65,21 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indice // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void E1_lagrange_interpolate_at_zero(ep_st* dest, const ep_st shares[], const uint8_t indices[], const int len) { +static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const uint8_t indices[], const int len) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... 
+ A_t*x^t in G1 // where A_i = g1 ^ a_i // Q(0) = share_i0 ^ L_i0(0) + share_i1 ^ L_i1(0) + .. + share_it ^ L_it(0) // where L is the Lagrange coefficient - // temp variables - ep_t mult; - ep_new(mult); - ep_set_infty(dest); - + E1_set_infty(out); Fr fr_lagr_coef; - for (int i=0; i < len; i++) { - Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); - bn_st* bn_lagr_coef = Fr_blst_to_relic(&fr_lagr_coef); - ep_mul_lwnaf(mult, &shares[i], bn_lagr_coef); - free(bn_lagr_coef); - ep_add_jacob(dest, dest, mult); + E1 mult; + for (int i=0; i < len; i++) { + Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); + E1_mult(&mult, &shares[i], &fr_lagr_coef); + E1_add(out, out, &mult); } - // free the temp memory - ep_free(mult); } // Computes the Langrange interpolation at zero LI(0) with regards to the indices [indices(0)..indices(t)] @@ -94,33 +87,25 @@ static void E1_lagrange_interpolate_at_zero(ep_st* dest, const ep_st shares[], c // `len` is equal to `t+1` where `t` is the polynomial degree. int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const uint8_t indices[], const int len) { int read_ret; - // temp variables - ep_t res; - ep_new(res); - ep_st* ep_shares = malloc(sizeof(ep_t) * len); - + E1* E1_shares = malloc(sizeof(E1) * len); for (int i=0; i < len; i++) { - ep_new(ep_shares[i]); - read_ret = ep_read_bin_compact(&ep_shares[i], &shares[SIGNATURE_LEN*i], SIGNATURE_LEN); - if (read_ret != RLC_OK) goto out; - + read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); + if (read_ret != BLST_SUCCESS) { + goto out; + } } + // G1 interpolation at 0 // computes Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1, // where A_i = g1 ^ a_i - E1_lagrange_interpolate_at_zero(res, ep_shares, indices, len); - + E1 res; + E1_lagrange_interpolate_at_zero(&res, E1_shares, indices, len); // export the result - ep_write_bin_compact(dest, res, SIGNATURE_LEN); + E1_write_bytes(dest, &res); read_ret = VALID; - out: // free the temp memory - ep_free(res); - for (int i=0; i < len; i++) { - ep_free(ep_shares[i]); - } - free(ep_shares); + free(E1_shares); return read_ret; } From 557d3a71523fe5cc8d37efe61346b0895d34866e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 12:28:49 -0600 Subject: [PATCH 087/200] uncomment BLST cross check tests --- crypto/bls_crossBLST_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index aabb5d0efaf..e4e957ea495 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -217,6 +217,6 @@ func TestCrossBLST(t *testing.T) { rapid.Check(t, testKeyGenCrossBLST) rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST) rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) - //rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented + rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented rapid.Check(t, testSignHashCrossBLST) } From 878b7e7fabbde68d584aeacfd9ae62d4d999153a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 12:43:03 -0600 Subject: [PATCH 088/200] spock works with new E1 type --- crypto/bls12381_utils.c | 8 +++----- crypto/spock.go | 8 ++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 655b3e3f8e6..8e770c182f8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1221,7 +1221,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const 
E2* pk2, const byte* ep_new(elemsG1[1]); ep2_new(elemsG2[1]); ep2_new(elemsG2[0]); - int ret; + int ret = UNDEFINED; // elemsG1[0] = s1 E1 s; @@ -1230,7 +1230,7 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* goto out; }; // check s1 is in G1 - if (E1_in_G1(&s) != VALID) { + if (!E1_in_G1(&s)) { ret = INVALID; goto out; } @@ -1238,13 +1238,12 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* ep_copy(elemsG1[0], s_tmp); // elemsG1[1] = s2 - E1 s; if (E1_read_bytes(&s, sig2, SIGNATURE_LEN) != BLST_SUCCESS) { ret = INVALID; goto out; }; // check s2 is in G1 - if (E1_in_G1(&s) != VALID) { + if (!E1_in_G1(&s)) { ret = INVALID; goto out; } @@ -1293,7 +1292,6 @@ int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* } goto out; } - ret = UNDEFINED; out: ep_free(elemsG1[0]); diff --git a/crypto/spock.go b/crypto/spock.go index dad711d9163..4fbd974c27f 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -90,10 +90,10 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur } // verify the spock proof using the secret data - verif := valid /*:= C.bls_spock_verify((*C.E2)(&blsPk1.point), - (*C.uchar)(&proof1[0]), - (*C.E2)(&blsPk2.point), - (*C.uchar)(&proof2[0]))*/ + verif := C.bls_spock_verify((*C.E2)(&blsPk1.point), + (*C.uchar)(&proof1[0]), + (*C.E2)(&blsPk2.point), + (*C.uchar)(&proof2[0])) switch verif { case invalid: From 49b01a8f28c49ced5a0489093d48b8db9fe19f1f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 13:07:06 -0600 Subject: [PATCH 089/200] write E1_sum_vector using new E1 type --- crypto/bls12381_utils.c | 82 +++++++++++++++++++---------------------- crypto/bls12381_utils.h | 11 +++--- crypto/bls_multisig.go | 4 +- 3 files changed, 44 insertions(+), 53 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 8e770c182f8..2e56ed6e387 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -860,6 +860,43 @@ void E1_sum_vector(E1* sum, const E1* y, const int len){ } } +// Computes the sum of input signatures (E1 elements) flattened in a single byte array +// `sigs_bytes` of `sigs_len` bytes. +// and writes the sum (E1 element) as bytes in `dest`. +// The function does not check membership of E1 inputs in G1 subgroup. +// The header is using byte pointers to minimize Cgo calls from the Go layer. 
+int E1_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int sigs_len) { + int error = UNDEFINED; + // sanity check that `len` is multiple of `G1_SER_BYTES` + if (sigs_len % G1_SER_BYTES) { + error = INVALID; + goto mem_error; + } + int n = sigs_len/G1_SER_BYTES; // number of signatures + + E1* sigs = (E1*) malloc(n * sizeof(E1)); + if (!sigs) goto mem_error; + + // import the points from the array + for (int i=0; i < n; i++) { + // deserialize each point from the input array + if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES) != BLST_SUCCESS) { + error = INVALID; + goto out; + } + } + // sum the points + E1 acc; + E1_sum_vector(&acc, sigs, n); + // export the result + E1_write_bytes(dest, &acc); + error = VALID; +out: + free(sigs); +mem_error: + return error; +} + // Exponentiation of generator g1 of G1, res = expo.g1 void G1_mult_gen(E1* res, const Fr* expo) { pow256 tmp; @@ -1309,51 +1346,6 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ E2_add(res, x, res); } -// computes the sum of the G1 array elements y and writes the sum in jointy -void ep_sum_vector(ep_t jointx, ep_st* x, const int len) { - ep_set_infty(jointx); - for (int i=0; i Date: Mon, 15 May 2023 13:51:34 -0600 Subject: [PATCH 090/200] clean up unsecure mapping to G1/G1 and fix subgroup checks --- crypto/bls.go | 11 -------- crypto/bls12381_utils.c | 49 +++++++++++++++++++---------------- crypto/bls12381_utils.go | 46 ++++++++++++++++++++------------ crypto/bls12381_utils.h | 8 +++--- crypto/bls12381_utils_test.go | 41 +++++++++++++++-------------- 5 files changed, 82 insertions(+), 73 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index a8caa047ee7..8cfd435b380 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -549,17 +549,6 @@ func (a *blsBLS12381Algo) init() error { return nil } -// This is only a TEST/DEBUG/BENCH function. -// It returns the hash-to-G1 point from a slice of 128 bytes -func mapToG1(data []byte) *pointE1 { - l := len(data) - var h pointE1 - if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { - return nil - } - return &h -} - // This is only a TEST function. // signWithXMDSHA256 signs a message using XMD_SHA256 as a hash to field. // diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 2e56ed6e387..a0c48795936 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1346,44 +1346,49 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ E2_add(res, x, res); } -/* -// maps the bytes to a point in G1 + +// maps the bytes to a point in G1. +// `len` should be at least Fr_BYTES. // this is a testing file only, should not be used in any protocol! -void map_bytes_to_G1(ep_t p, const byte* bytes, int len) { +void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { + assert(len > Fr_BYTES); // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); // multiplies G1 generator by a random scalar - - + G1_mult_gen(p, &log); } // generates a point in E1\G1 and stores it in p // this is a testing file only, should not be used in any protocol! 
-void map_bytes_to_G1complement(ep_t p, const byte* bytes, int len) { - // generate a random point in E1 - p->coord = BASIC; - fp_set_dig(p->z, 1); - do { - fp_rand(p->x); // set x to a random field element - byte r; - rand_bytes(&r, 1); - fp_zero(p->y); - fp_set_bit(p->y, 0, r&1); // set y randomly to 0 or 1 +BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { + assert(G1_SERIALIZATION == COMPRESSED); + assert(len >= G1_SER_BYTES); + + // attempt to deserilize a compressed E1 point from input bytes + // after fixing the header 2 bits + byte copy[G1_SER_BYTES]; + memcpy(copy, bytes, sizeof(copy)); + copy[0] |= 1<<7; // set compression bit + copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity + + BLST_ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); + if (ser != BLST_SUCCESS) { + return ser; } - while (ep_upk(p, p) == 0); // make sure p is in E1 - // map the point to E1\G1 by clearing G1 order - ep_mul_basic(p, p, &core_get()->ep_r); + // map the point to E2\G2 by clearing G2 order + E1_mult(p, p, (const Fr*)BLS12_381_r); + E1_to_affine(p, p); - assert(ep_on_curve(p)); // sanity check to make sure p is in E1 + assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 + return BLST_SUCCESS; } -*/ // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! -void map_bytes_to_G2(E2* p, const byte* bytes, int len) { +void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { assert(len > Fr_BYTES); // map to Fr Fr log; @@ -1397,7 +1402,7 @@ void map_bytes_to_G2(E2* p, const byte* bytes, int len) { // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! -BLST_ERROR map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { +BLST_ERROR unsecure_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 253d8904ca1..1fb9808edb6 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -256,30 +256,42 @@ func checkMembershipG2(pt *pointE2) bool { return C.E2_in_G2((*C.E2)(pt)) != (C.ulonglong)(0) } -/* -// randPointG1 wraps a call to C since cgo can't be used in go test files. -// It generates a random point in G1 and stores it in input point. -func randPointG1(pt *pointE1) { - C.ep_rand_G1((*C.E1)(pt)) +// This is only a TEST/DEBUG/BENCH function. +// It returns the hash-to-G1 point from a slice of 128 bytes +func mapToG1(data []byte) *pointE1 { + l := len(data) + var h pointE1 + if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { + return nil + } + return &h +} + +// mapToG1 is a test function, it wraps a call to C since cgo can't be used in go test files. +// It maps input bytes to a point in G2 and stores it in input point. +// THIS IS NOT the kind of mapping function that is used in BLS signature. +func unsecureMapToG1(pt *pointE1, seed []byte) { + C.unsecure_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// randPointG1Complement wraps a call to C since cgo can't be used in go test files. -// It generates a random point in E1\G1 and stores it in input point. 
-func randPointG1Complement(pt *pointE1) { - C.ep_rand_G1complement((*C.E1)(pt)) +// unsecureMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. +// It generates a random point in E2\G2 and stores it in input point. +func unsecureMapToG1Complement(pt *pointE1, seed []byte) bool { + res := C.unsecure_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) + return int(res) == blst_valid } -*/ -// mapToG2 wraps a call to C since cgo can't be used in go test files. -// It generates a random point in G2 and stores it in input point. -func mapToG2(pt *pointE2, src []byte) { - C.map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) +// unsecureMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. +// It maps input bytes to a point in G2 and stores it in input point. +// THIS IS NOT the kind of mapping function that is used in BLS signature. +func unsecureMapToG2(pt *pointE2, seed []byte) { + C.unsecure_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// mapToG2Complement wraps a call to C since cgo can't be used in go test files. +// unsecureMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func mapToG2Complement(pt *pointE2, src []byte) bool { - res := C.map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&src[0]), (C.int)(len(src))) +func unsecureMapToG2Complement(pt *pointE2, seed []byte) bool { + res := C.unsecure_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) return int(res) == blst_valid } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index ffe8fd0f650..61a6af53069 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -111,8 +111,8 @@ void E1_write_bytes(byte *, const E1*); ep_st* E1_blst_to_relic(const E1* x); int ep_read_bin_compact(ep_t, const byte *, const int); void ep_write_bin_compact(byte *, const ep_t, const int); -void map_bytes_to_G1(E1*, const byte*, int); -void map_bytes_to_G1complement(E1*, const byte*, int); +void unsecure_map_bytes_to_G1(E1*, const byte*, int); +BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); // E2 and G2 utilities void E2_set_infty(E2* p); @@ -130,8 +130,8 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool_t E2_in_G2(const E2*); -void map_bytes_to_G2(E2*, const byte*, int); -BLST_ERROR map_bytes_to_G2complement(E2*, const byte*, int); +void unsecure_map_bytes_to_G2(E2*, const byte*, int); +BLST_ERROR unsecure_map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 2fc03efe267..e5207d9f68a 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -101,26 +101,30 @@ func TestSubgroupCheck(t *testing.T) { _, err := prg.Read(seed) require.NoError(t, err) - /*t.Run("G1", func(t *testing.T) { + t.Run("G1", func(t *testing.T) { var p pointE1 - randPointG1(&p) // point in G1 - res := checkMembershipG1(&p) - assert.Equal(t, res, int(valid)) - randPointG1Complement(&p) // point in E1\G1 - res = checkMembershipG1(&p) - assert.Equal(t, res, int(invalid)) - })*/ + unsecureMapToG1(&p, seed) // point in G1 + assert.True(t, checkMembershipG1(&p)) 
+ + inG1 := false + for !inG1 { + _, err := prg.Read(seed) + require.NoError(t, err) + inG1 = unsecureMapToG1Complement(&p, seed) // point in E2\G2 + } + assert.False(t, checkMembershipG1(&p)) + }) t.Run("G2", func(t *testing.T) { var p pointE2 - mapToG2(&p, seed) // point in G2 + unsecureMapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) inG2 := false for !inG2 { - _, err := mrand.Read(seed) + _, err := prg.Read(seed) require.NoError(t, err) - inG2 = mapToG2Complement(&p, seed) // point in E2\G2 + inG2 = unsecureMapToG2Complement(&p, seed) // point in E2\G2 } assert.False(t, checkMembershipG2(&p)) }) @@ -128,24 +132,23 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { + seed := make([]byte, PubKeyLenBLSBLS12381) + _, err := mrand.Read(seed) + require.NoError(b, err) - /*b.Run("G1", func(b *testing.B) { + b.Run("G1", func(b *testing.B) { var p pointE1 - randPointG1(&p) + unsecureMapToG1(&p, seed) // point in G1 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG1(&p) // G1 } b.StopTimer() - })*/ + }) b.Run("G2", func(b *testing.B) { var p pointE2 - seed := make([]byte, PubKeyLenBLSBLS12381) - _, err := mrand.Read(seed) - require.NoError(b, err) - mapToG2(&p, seed) // point in G2 - + unsecureMapToG2(&p, seed) // point in G2 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG2(&p) // G2 From c338c934ed53f111cd706a0ba553931731925fb0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 14:09:35 -0600 Subject: [PATCH 091/200] clean up constants pre-computation --- crypto/bls12381_utils.c | 111 ++++----------------------------------- crypto/bls12381_utils.go | 2 - crypto/bls12381_utils.h | 18 +------ 3 files changed, 11 insertions(+), 120 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index a0c48795936..c2058c4148d 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -52,37 +52,10 @@ ctx_t* relic_init_BLS12_381() { ep_param_set(B12_P381); ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist #endif - if (ret != RLC_OK) return NULL; return core_get(); } -// global variable of the pre-computed data -prec_st bls_prec_st; -prec_st* bls_prec = NULL; - -// sets the global variable to input -void precomputed_data_set(const prec_st* p) { - bls_prec = (prec_st*)p; -} - -// Reads a prime field element from a digit vector in big endian format. -// There is no conversion to Montgomery domain in this function. 
-#define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_LIMBS) - -// pre-compute some data required for curve BLS12-381 -prec_st* init_precomputed_data_BLS12_381() { - bls_prec = &bls_prec_st; - ctx_t* ctx = core_get(); - - // (p-1)/2 - bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - - // Montgomery constant R - fp_set_dig(bls_prec->r, 1); - return bls_prec; -} - // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -92,27 +65,6 @@ const Fr BLS12_381_rR = {{ \ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ }}; -// TODO: temp utility function to delete -bn_st* Fr_blst_to_relic(const Fr* x) { - bn_st* out = (bn_st*)malloc(sizeof(bn_st)); - byte* data = (byte*)malloc(Fr_BYTES); - be_bytes_from_limbs(data, (limb_t*)x, Fr_BYTES); - out->alloc = RLC_DV_DIGS; - bn_read_bin(out, data, Fr_BYTES); - free(data); - return out; -} - -// TODO: temp utility function to delete -Fr* Fr_relic_to_blst(const bn_st* x){ - Fr* out = (Fr*)malloc(sizeof(Fr)); - byte* data = (byte*)malloc(Fr_BYTES); - bn_write_bin(data, Fr_BYTES, x); - Fr_read_bytes(out, data, Fr_BYTES); - free(data); - return out; -} - // returns true if a == 0 and false otherwise bool_t Fr_is_zero(const Fr* a) { return bytes_are_zero((const byte*)a, sizeof(Fr)); @@ -566,18 +518,8 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- E1 utilities -// TODO: temp utility function to delete -ep_st* E1_blst_to_relic(const E1* x) { - ep_st* out = (ep_st*)malloc(sizeof(ep_st)); - byte* data = (byte*)malloc(G1_SER_BYTES); - E1_write_bytes(data, x); - ep_read_bin_compact(out, data, G1_SER_BYTES); - free(data); - return out; -} - // TODO: to delete, only used by temporary E2_blst_to_relic -int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { +static int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { // check the length const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); if (len!=G1_size) { @@ -641,51 +583,16 @@ int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { return RLC_ERR; } - -// TODO: delete aftet deleting ep_write_bin_compact -static int fp_get_sign(const fp_t y) { - bn_t bn_y; - bn_new(bn_y); - fp_prime_back(bn_y, y); - return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; +// TODO: temp utility function to delete +ep_st* E1_blst_to_relic(const E1* x) { + ep_st* out = (ep_st*)malloc(sizeof(ep_st)); + byte* data = (byte*)malloc(G1_SER_BYTES); + E1_write_bytes(data, x); + ep_read_bin_compact(out, data, G1_SER_BYTES); + free(data); + return out; } -// TODO: to delete, only used by temporary E2_blst_to_relic -void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - - if (len!=G1_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep_is_infty(a)) { - // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | (1<<6); - memset(bin+1, 0, G1_size-1); - return; - } - - RLC_TRY { - ep_t t; - ep_null(t); - ep_new(t); - ep_norm(t, a); - fp_write_bin(bin, Fp_BYTES, t->x); - - if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp_get_sign(t->y) << 5); - } else { - fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); - } - ep_free(t); - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } - - bin[0] |= (G1_SERIALIZATION << 7); - } - void E1_copy(E1* res, const E1* p) { vec_copy(res, p, sizeof(E1)); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 1fb9808edb6..9ab3e22545d 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -54,7 
+54,6 @@ var frBytesLen = int(C.get_Fr_BYTES()) // context required for the BLS set-up type ctx struct { relicCtx *C.ctx_t - precCtx *C.prec_st } // get some constants from the C layer @@ -88,7 +87,6 @@ func (ct *ctx) initContext() error { return errors.New("Relic core init failed") } ct.relicCtx = c - ct.precCtx = C.init_precomputed_data_BLS12_381() return nil } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 61a6af53069..23a28299722 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -42,17 +42,9 @@ #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) -// Structure of precomputed data -typedef struct prec_ { - // other field-related constants - bn_st p_1div2; - fp_t r; // Montgomery multiplication constant -} prec_st; - // TODO: to delete when Relic is removed -bn_st* Fr_blst_to_relic(const Fr* x); -Fr* Fr_relic_to_blst(const bn_st* x); -ep2_st* E2_blst_to_relic(const E2* x); +ep2_st* E2_blst_to_relic(const E2* x); +ep_st* E1_blst_to_relic(const E1* x); int get_valid(); int get_invalid(); @@ -107,10 +99,6 @@ int E1_sum_vector_byte(byte*, const byte*, const int); void G1_mult_gen(E1*, const Fr*); BLST_ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); - -ep_st* E1_blst_to_relic(const E1* x); -int ep_read_bin_compact(ep_t, const byte *, const int); -void ep_write_bin_compact(byte *, const ep_t, const int); void unsecure_map_bytes_to_G1(E1*, const byte*, int); BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); @@ -135,8 +123,6 @@ BLST_ERROR unsecure_map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); -prec_st* init_precomputed_data_BLS12_381(); -void precomputed_data_set(const prec_st* p); // utility testing function void xmd_sha256(byte *, int, byte *, int, byte *, int); From 3fd05397c559270d910c8cdbcf22b0974ebae5c7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 17:35:12 -0600 Subject: [PATCH 092/200] add read/write tests for G1 points --- crypto/bls12381_utils.c | 10 ++++++++-- crypto/bls12381_utils.go | 7 ++++++- crypto/bls12381_utils.h | 3 ++- crypto/bls12381_utils_test.go | 36 +++++++++++++++++++++++++++++++++++ crypto/bls_crossBLST_test.go | 3 +-- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index c2058c4148d..cd19f3ad5fa 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -597,6 +597,12 @@ void E1_copy(E1* res, const E1* p) { vec_copy(res, p, sizeof(E1)); } +// checks p1 == p2 +bool_t E1_is_equal(const E1* p1, const E1* p2) { + // `POINTonE1_is_equal` includes the infinity case + return POINTonE1_is_equal((const POINTonE1*)p1, (const POINTonE1*)p2); +} + // compare p to infinity bool_t E1_is_infty(const E1* p) { // BLST infinity points are defined by Z=0 @@ -1258,7 +1264,7 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ // `len` should be at least Fr_BYTES. // this is a testing file only, should not be used in any protocol! void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { - assert(len > Fr_BYTES); + assert(len >= Fr_BYTES); // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); @@ -1296,7 +1302,7 @@ BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! 
void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { - assert(len > Fr_BYTES); + assert(len >= Fr_BYTES); // map to Fr Fr log; map_bytes_to_Fr(&log, bytes, len); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 9ab3e22545d..bc1b9dc064a 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -111,7 +111,12 @@ func (x *scalar) equals(other *scalar) bool { return C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other)) != 0 } -// comparison in G2 +// comparison in E1 +func (p *pointE1) equals(other *pointE1) bool { + return C.E1_is_equal((*C.E1)(p), (*C.E1)(other)) != 0 +} + +// comparison in E2 func (p *pointE2) equals(other *pointE2) bool { return C.E2_is_equal((*C.E2)(p), (*C.E2)(other)) != 0 } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 23a28299722..5c6aab8313d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -87,6 +87,7 @@ void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); +bool_t E1_is_equal(const E1*, const E1*); void E1_set_infty(E1*); bool_t E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); @@ -106,7 +107,7 @@ BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); void E2_set_infty(E2* p); bool_t E2_is_infty(const E2*); bool_t E2_affine_on_curve(const E2*); -bool_t E2_is_equal(const E2* p1, const E2* p2); +bool_t E2_is_equal(const E2*, const E2*); void E2_copy(E2*, const E2*); void E2_to_affine(E2*, const E2*); BLST_ERROR E2_read_bytes(E2*, const byte *, const int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index e5207d9f68a..10db3d57714 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -157,6 +157,42 @@ func BenchmarkSubgroupCheck(b *testing.B) { }) } +// specific test of G1 points Encode and decode (BLS signature since the library is set for min_sig). +// G2 points read and write are implicitly tested by public keys Encode/Decode. 
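// The roundtrip below uses the 48-byte compressed serialization (SignatureLenBLSBLS12381):
// writePointE1 encodes a point into the buffer and readPointE1 must decode it back to an
// equal point, including the special encoding of the identity (infinity) point.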
+func TestReadWriteG1(t *testing.T) { + prg := getPRG(t) + seed := make([]byte, frBytesLen) + bytes := make([]byte, SignatureLenBLSBLS12381) + // generate a random G1 point, encode it, decode it, + // and compare it the original point + iterations := 50 + t.Run("random points", func(t *testing.T) { + for i := 0; i < iterations; i++ { + var p, q pointE1 + _, err := prg.Read(seed) + unsecureMapToG1(&p, seed) + require.NoError(t, err) + writePointE1(bytes, &p) + err = readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) + } + }) + + t.Run("infinity", func(t *testing.T) { + for i := 0; i < iterations; i++ { + var p, q pointE1 + seed := make([]byte, frBytesLen) + unsecureMapToG1(&p, seed) // this results in the infinity point + writePointE1(bytes, &p) + require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check + err := readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) + } + }) +} + // test some edge cases of MapToFr to validate modular reduction and endianness: // - inputs `0` and curve order `r` // - inputs `1` and `r+1` diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index e4e957ea495..ffdb156e251 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -160,7 +160,6 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) writePointE1(sigFlowOutBytes, &pointFlow) sigBLSTOutBytes := pointBLST.Compress() - assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes) } } @@ -217,6 +216,6 @@ func TestCrossBLST(t *testing.T) { rapid.Check(t, testKeyGenCrossBLST) rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST) rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) - rapid.Check(t, testEncodeDecodeG1CrossBLST) // commented till G1 check is implemented + rapid.Check(t, testEncodeDecodeG1CrossBLST) rapid.Check(t, testSignHashCrossBLST) } From 29f748910d580d87823123fed95ed06abdee45cb Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 18:31:10 -0600 Subject: [PATCH 093/200] rename insecure to unsafe --- crypto/bls12381_utils.c | 8 ++++---- crypto/bls12381_utils.go | 22 +++++++++++----------- crypto/bls12381_utils.h | 8 ++++---- crypto/bls12381_utils_test.go | 16 ++++++++-------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index cd19f3ad5fa..ca92f4f85a2 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1263,7 +1263,7 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ // maps the bytes to a point in G1. // `len` should be at least Fr_BYTES. // this is a testing file only, should not be used in any protocol! -void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { +void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { assert(len >= Fr_BYTES); // map to Fr Fr log; @@ -1274,7 +1274,7 @@ void unsecure_map_bytes_to_G1(E1* p, const byte* bytes, int len) { // generates a point in E1\G1 and stores it in p // this is a testing file only, should not be used in any protocol! -BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { +BLST_ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { assert(G1_SERIALIZATION == COMPRESSED); assert(len >= G1_SER_BYTES); @@ -1301,7 +1301,7 @@ BLST_ERROR unsecure_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. 
// this is a testing tool only, it should not be used in any protocol! -void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { +void unsafe_map_bytes_to_G2(E2* p, const byte* bytes, int len) { assert(len >= Fr_BYTES); // map to Fr Fr log; @@ -1315,7 +1315,7 @@ void unsecure_map_bytes_to_G2(E2* p, const byte* bytes, int len) { // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! -BLST_ERROR unsecure_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { +BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index bc1b9dc064a..e83359263ea 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -273,28 +273,28 @@ func mapToG1(data []byte) *pointE1 { // mapToG1 is a test function, it wraps a call to C since cgo can't be used in go test files. // It maps input bytes to a point in G2 and stores it in input point. // THIS IS NOT the kind of mapping function that is used in BLS signature. -func unsecureMapToG1(pt *pointE1, seed []byte) { - C.unsecure_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG1(pt *pointE1, seed []byte) { + C.unsafe_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// unsecureMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. +// unsafeMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func unsecureMapToG1Complement(pt *pointE1, seed []byte) bool { - res := C.unsecure_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG1Complement(pt *pointE1, seed []byte) bool { + res := C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) return int(res) == blst_valid } -// unsecureMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. +// unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. // It maps input bytes to a point in G2 and stores it in input point. // THIS IS NOT the kind of mapping function that is used in BLS signature. -func unsecureMapToG2(pt *pointE2, seed []byte) { - C.unsecure_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG2(pt *pointE2, seed []byte) { + C.unsafe_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// unsecureMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. +// unsafeMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. 
-func unsecureMapToG2Complement(pt *pointE2, seed []byte) bool { - res := C.unsecure_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) +func unsafeMapToG2Complement(pt *pointE2, seed []byte) bool { + res := C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) return int(res) == blst_valid } diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 5c6aab8313d..1e4413b914c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -100,8 +100,8 @@ int E1_sum_vector_byte(byte*, const byte*, const int); void G1_mult_gen(E1*, const Fr*); BLST_ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); -void unsecure_map_bytes_to_G1(E1*, const byte*, int); -BLST_ERROR unsecure_map_bytes_to_G1complement(E1*, const byte*, int); +void unsafe_map_bytes_to_G1(E1*, const byte*, int); +BLST_ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); // E2 and G2 utilities void E2_set_infty(E2* p); @@ -119,8 +119,8 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool_t E2_in_G2(const E2*); -void unsecure_map_bytes_to_G2(E2*, const byte*, int); -BLST_ERROR unsecure_map_bytes_to_G2complement(E2*, const byte*, int); +void unsafe_map_bytes_to_G2(E2*, const byte*, int); +BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 10db3d57714..ae1b240d8ae 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -103,28 +103,28 @@ func TestSubgroupCheck(t *testing.T) { t.Run("G1", func(t *testing.T) { var p pointE1 - unsecureMapToG1(&p, seed) // point in G1 + unsafeMapToG1(&p, seed) // point in G1 assert.True(t, checkMembershipG1(&p)) inG1 := false for !inG1 { _, err := prg.Read(seed) require.NoError(t, err) - inG1 = unsecureMapToG1Complement(&p, seed) // point in E2\G2 + inG1 = unsafeMapToG1Complement(&p, seed) // point in E2\G2 } assert.False(t, checkMembershipG1(&p)) }) t.Run("G2", func(t *testing.T) { var p pointE2 - unsecureMapToG2(&p, seed) // point in G2 + unsafeMapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) inG2 := false for !inG2 { _, err := prg.Read(seed) require.NoError(t, err) - inG2 = unsecureMapToG2Complement(&p, seed) // point in E2\G2 + inG2 = unsafeMapToG2Complement(&p, seed) // point in E2\G2 } assert.False(t, checkMembershipG2(&p)) }) @@ -138,7 +138,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { b.Run("G1", func(b *testing.B) { var p pointE1 - unsecureMapToG1(&p, seed) // point in G1 + unsafeMapToG1(&p, seed) // point in G1 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG1(&p) // G1 @@ -148,7 +148,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { b.Run("G2", func(b *testing.B) { var p pointE2 - unsecureMapToG2(&p, seed) // point in G2 + unsafeMapToG2(&p, seed) // point in G2 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG2(&p) // G2 @@ -170,7 +170,7 @@ func TestReadWriteG1(t *testing.T) { for i := 0; i < iterations; i++ { var p, q pointE1 _, err := prg.Read(seed) - unsecureMapToG1(&p, seed) + unsafeMapToG1(&p, seed) require.NoError(t, err) writePointE1(bytes, &p) err = readPointE1(&q, bytes) @@ -183,7 +183,7 @@ func TestReadWriteG1(t *testing.T) { for i := 0; i < iterations; i++ { var p, q pointE1 seed := make([]byte, frBytesLen) 
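			// an all-zero seed is reduced to the zero scalar by map_bytes_to_Fr,
			// and multiplying the G1 generator by zero yields the point at infinity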
- unsecureMapToG1(&p, seed) // this results in the infinity point + unsafeMapToG1(&p, seed) // this results in the infinity point writePointE1(bytes, &p) require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check err := readPointE1(&q, bytes) From c47c3211e60da57086b7a0ee46d549834d10071b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 15 May 2023 20:20:03 -0600 Subject: [PATCH 094/200] fix node info comparison bug in test --- model/bootstrap/node_info.go | 12 ++++++++++++ model/bootstrap/node_info_test.go | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/model/bootstrap/node_info.go b/model/bootstrap/node_info.go index cdc6f855c4a..62a33f6f442 100644 --- a/model/bootstrap/node_info.go +++ b/model/bootstrap/node_info.go @@ -174,6 +174,18 @@ type decodableNodeInfoPub struct { Stake uint64 } +func (info *NodeInfoPub) Equals(other *NodeInfoPub) bool { + if other == nil { + return false + } + return info.Address == other.Address && + info.NodeID == other.NodeID && + info.Role == other.Role && + info.Weight == other.Weight && + info.NetworkPubKey.PublicKey.Equals(other.NetworkPubKey.PublicKey) && + info.StakingPubKey.PublicKey.Equals(other.StakingPubKey.PublicKey) +} + func (info *NodeInfoPub) UnmarshalJSON(b []byte) error { var decodable decodableNodeInfoPub err := json.Unmarshal(b, &decodable) diff --git a/model/bootstrap/node_info_test.go b/model/bootstrap/node_info_test.go index 536c0c808f9..39294de5f69 100644 --- a/model/bootstrap/node_info_test.go +++ b/model/bootstrap/node_info_test.go @@ -50,7 +50,7 @@ func TestNodeInfoPubEncodingJSON(t *testing.T) { var dec bootstrap.NodeInfoPub err = json.Unmarshal(enc, &dec) require.NoError(t, err) - assert.Equal(t, conf, dec) + assert.True(t, dec.Equals(&conf)) }) t.Run("compat: should accept old files using Stake field", func(t *testing.T) { conf := unittest.NodeInfoFixture().Public() @@ -61,6 +61,6 @@ func TestNodeInfoPubEncodingJSON(t *testing.T) { var dec bootstrap.NodeInfoPub err = json.Unmarshal(enc, &dec) require.NoError(t, err) - assert.Equal(t, conf, dec) + assert.True(t, dec.Equals(&conf)) }) } From 184a49d110dbc830af1c05d8a3a6d85dcfaab148 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 16 May 2023 11:33:53 -0600 Subject: [PATCH 095/200] fix public key comparison bugs in tests --- engine/access/access_test.go | 3 --- model/encodable/keys_test.go | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/engine/access/access_test.go b/engine/access/access_test.go index a2af4f64481..768cc9b0ee2 100644 --- a/engine/access/access_test.go +++ b/engine/access/access_test.go @@ -562,15 +562,12 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { for i, serviceEvent := range executionResult.ServiceEvents { assert.Equal(suite.T(), serviceEvent.Type.String(), er.ServiceEvents[i].Type) event := serviceEvent.Event - marshalledEvent, err := json.Marshal(event) require.NoError(suite.T(), err) - assert.Equal(suite.T(), marshalledEvent, er.ServiceEvents[i].Payload) } parsedExecResult, err := convert.MessageToExecutionResult(resp.ExecutionResult) require.NoError(suite.T(), err) - assert.Equal(suite.T(), parsedExecResult, executionResult) assert.Equal(suite.T(), parsedExecResult.ID(), executionResult.ID()) } diff --git a/model/encodable/keys_test.go b/model/encodable/keys_test.go index ccdf63cd044..338c1708366 100644 --- a/model/encodable/keys_test.go +++ b/model/encodable/keys_test.go @@ -247,7 +247,7 @@ func TestEncodableRandomBeaconPrivKeyMsgPack(t *testing.T) { err = key.UnmarshalMsgpack(b) 
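	// the in-memory representation of a BLST-backed public key is not a reliable
	// basis for comparison, so the assertion below uses the key's Equals method
	// rather than require.Equal on the struct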
 	require.NoError(t, err)
-	require.Equal(t, oldPubKey, key.PublicKey)
+	require.True(t, oldPubKey.Equals(key.PublicKey))
 }
 
 func generateRandomSeed(t *testing.T) []byte {

From a4fb4357656dd6fbbc5e101cda001471a3445c81 Mon Sep 17 00:00:00 2001
From: Tarak Ben Youssef
Date: Tue, 16 May 2023 12:28:08 -0600
Subject: [PATCH 096/200] yet another key comparison bug in tests

---
 model/flow/identity.go      | 13 +++++++++++++
 model/flow/identity_test.go |  8 ++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/model/flow/identity.go b/model/flow/identity.go
index f05188988e6..eb86279641b 100644
--- a/model/flow/identity.go
+++ b/model/flow/identity.go
@@ -61,6 +61,19 @@ type Identity struct {
 	NetworkPubKey crypto.PublicKey
 }
 
+func (id *Identity) Equals(other *Identity) bool {
+	if other == nil {
+		return false
+	}
+	return id.NodeID == other.NodeID &&
+		id.Address == other.Address &&
+		id.Role == other.Role &&
+		id.Weight == other.Weight &&
+		id.Ejected == other.Ejected &&
+		id.StakingPubKey.Equals(other.StakingPubKey) &&
+		id.NetworkPubKey.Equals(other.NetworkPubKey)
+}
+
 // ParseIdentity parses a string representation of an identity.
 func ParseIdentity(identity string) (*Identity, error) {
 
diff --git a/model/flow/identity_test.go b/model/flow/identity_test.go
index 9c1a137d8ab..0f3b2c2145a 100644
--- a/model/flow/identity_test.go
+++ b/model/flow/identity_test.go
@@ -60,7 +60,7 @@ func TestIdentityEncodingJSON(t *testing.T) {
 		var dec flow.Identity
 		err = json.Unmarshal(enc, &dec)
 		require.NoError(t, err)
-		require.Equal(t, identity, &dec)
+		require.True(t, identity.Equals(&dec))
 	})
 
 	t.Run("empty address should be omitted", func(t *testing.T) {
@@ -73,7 +73,7 @@ func TestIdentityEncodingJSON(t *testing.T) {
 		var dec flow.Identity
 		err = json.Unmarshal(enc, &dec)
 		require.NoError(t, err)
-		require.Equal(t, identity, &dec)
+		require.True(t, identity.Equals(&dec))
 	})
 
 	t.Run("compat: should accept old files using Stake field", func(t *testing.T) {
@@ -85,7 +85,7 @@ func TestIdentityEncodingJSON(t *testing.T) {
 		var dec flow.Identity
 		err = json.Unmarshal(enc, &dec)
 		require.NoError(t, err)
-		require.Equal(t, identity, &dec)
+		require.True(t, identity.Equals(&dec))
 	})
 }
 
@@ -96,7 +96,7 @@ func TestIdentityEncodingMsgpack(t *testing.T) {
 	var dec flow.Identity
 	err = msgpack.Unmarshal(enc, &dec)
 	require.NoError(t, err)
-	require.Equal(t, identity, &dec)
+	require.True(t, identity.Equals(&dec))
 }
 
 func TestIdentityList_Exists(t *testing.T) {

From ff5a0c7d1ec3fb682836056b218f4d0214e6587e Mon Sep 17 00:00:00 2001
From: Tarak Ben Youssef
Date: Tue, 16 May 2023 14:11:21 -0600
Subject: [PATCH 097/200] another bug: compare ER based on IDs and not the in-mem struct

---
 state/protocol/badger/snapshot_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/state/protocol/badger/snapshot_test.go b/state/protocol/badger/snapshot_test.go
index 93c72cbeb9e..01c50b94336 100644
--- a/state/protocol/badger/snapshot_test.go
+++ b/state/protocol/badger/snapshot_test.go
@@ -832,7 +832,7 @@ func TestLatestSealedResult(t *testing.T) {
 			expectedResult, expectedSeal, err := rootSnapshot.SealedResult()
 			require.NoError(t, err)
 
-			assert.Equal(t, expectedResult, gotResult)
+			assert.Equal(t, expectedResult.ID(), gotResult.ID())
 			assert.Equal(t, expectedSeal, gotSeal)
 		})
 	})

From ee91d4f14e512a5303e71b8bdd822b515c519f1c Mon Sep 17 00:00:00 2001
From: Tarak Ben Youssef
Date: Tue, 23 May 2023 16:07:46 -0600
Subject: [PATCH 098/200] use BLST multi_pairing to verify BLS signature with many messages

---
 crypto/bls_core.c  |
106 +++++++++++++-------------------------------- crypto/bls_test.go | 2 +- 2 files changed, 32 insertions(+), 76 deletions(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 30f25419aec..4c73e1131b2 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -89,75 +89,53 @@ int bls_verifyPerDistinctMessage(const byte* sig, int ret = UNDEFINED; // return value - ep_t* elemsG1 = (ep_t*)malloc((nb_hashes + 1) * sizeof(ep_t)); + E1* elemsG1 = (E1*)malloc((nb_hashes + 1) * sizeof(E1)); if (!elemsG1) goto outG1; - ep2_t* elemsG2 = (ep2_t*)malloc((nb_hashes + 1) * sizeof(ep2_t)); + E2* elemsG2 = (E2*)malloc((nb_hashes + 1) * sizeof(E2)); if (!elemsG2) goto outG2; - for (int i=0; i < nb_hashes+1; i++) { - ep_new(elemsG1[i]); - ep2_new(elemsG2[i]); - } - // elemsG1[0] = sig - E1 s; - if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { ret = INVALID; goto out; } - // check s is in G1 - if (!E1_in_G1(&s)) goto out; - ep_st* s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[0], s_tmp); + // check signature is in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } // elemsG2[0] = -g2 - ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded + E2_copy(&elemsG2[0], BLS12_381_minus_g2); // map all hashes to G1 int offset = 0; for (int i=1; i < nb_hashes+1; i++) { // elemsG1[i] = h // hash to G1 - E1 h; - map_to_G1(&h, &hashes[offset], len_hashes[i-1]); - ep_st* h_tmp = (ep_st*) E1_blst_to_relic(&h); - ep_copy(elemsG1[i], h_tmp); + map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i-1]); offset += len_hashes[i-1]; } // aggregate public keys mapping to the same hash offset = 0; - E2 tmp; for (int i=1; i < nb_hashes+1; i++) { // elemsG2[i] = agg_pk[i] - E2_sum_vector(&tmp, &pks[offset] , pks_per_hash[i-1]); - ep2_st* relic_tmp = E2_blst_to_relic(&tmp); - ep2_copy(elemsG2[i], relic_tmp); - free(relic_tmp); + E2_sum_vector(&elemsG2[i], &pks[offset] , pks_per_hash[i-1]); offset += pks_per_hash[i-1]; } - fp12_t pair; - fp12_new(&pair); - // double pairing with Optimal Ate - pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), nb_hashes+1); - - // compare the result to 1 - int cmp_res = fp12_cmp_dig(pair, 1); - - if (core_get()->code == RLC_OK) { - if (cmp_res == RLC_EQ) ret = VALID; - else ret = INVALID; + // multi pairing + Fp12 e; + multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); + if (Fp12_is_one(&e)) { + ret = VALID; } else { - ret = UNDEFINED; + ret = INVALID; } out: - for (int i=0; i < nb_hashes+1; i++) { - ep_free(elemsG1[i]); - ep2_free(elemsG2[i]); - } free(elemsG2); outG2: free(elemsG1); @@ -185,38 +163,29 @@ int bls_verifyPerDistinctKey(const byte* sig, int ret = UNDEFINED; // return value - ep_t* elemsG1 = (ep_t*)malloc((nb_pks + 1) * sizeof(ep_t)); + E1* elemsG1 = (E1*)malloc((nb_pks + 1) * sizeof(E1)); if (!elemsG1) goto outG1; - ep2_t* elemsG2 = (ep2_t*)malloc((nb_pks + 1) * sizeof(ep2_t)); + E2* elemsG2 = (E2*)malloc((nb_pks + 1) * sizeof(E2)); if (!elemsG2) goto outG2; - for (int i=0; i < nb_pks+1; i++) { - ep_new(elemsG1[i]); - ep2_new(elemsG2[i]); - } // elemsG1[0] = s - E1 s; - if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { ret = INVALID; goto out; } // check s in G1 - if (!E1_in_G1(&s)){ + if (!E1_in_G1(&elemsG1[0])){ ret = INVALID; goto out; - } - ep_st* s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[0], s_tmp); + } // elemsG2[0] = -g2 - ep2_neg(elemsG2[0], core_get()->ep2_g); // 
could be hardcoded + E2_copy(&elemsG2[0], BLS12_381_minus_g2); // set the public keys for (int i=1; i < nb_pks+1; i++) { - ep2_st* tmp = E2_blst_to_relic(&pks[i-1]); - ep2_copy(elemsG2[i], tmp); - free(tmp); + E2_copy(&elemsG2[i], &pks[i-1]); } // map all hashes to G1 and aggregate the ones with the same public key @@ -246,34 +215,21 @@ int bls_verifyPerDistinctKey(const byte* sig, index_offset++; } // aggregate all the points of the array - E1 sum; - E1_sum_vector(&sum, tmp_hashes, hashes_per_pk[i-1]); - ep_st* sum_tmp = E1_blst_to_relic(&sum); - ep_copy(elemsG1[i], sum_tmp); + E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i-1]); } - for (int i=0; icode == RLC_OK) { - if (cmp_res == RLC_EQ) ret = VALID; - else ret = INVALID; + if (Fp12_is_one(&e)) { + ret = VALID; } else { - ret = UNDEFINED; + ret = INVALID; } out: - for (int i=0; i < nb_pks+1; i++) { - ep_free(elemsG1[i]); - ep2_free(elemsG2[i]); - } free(elemsG2); outG2: free(elemsG1); diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 7de6af89325..613e68c07a5 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -900,7 +900,7 @@ func BenchmarkBatchVerify(b *testing.B) { // // Aggregate n signatures of distinct messages under different keys, // and verify the aggregated signature using the multi-signature verification with -// many message. +// many messages. func TestBLSAggregateSignaturesManyMessages(t *testing.T) { rand := getPRG(t) // number of signatures to aggregate From e84403d6489c3d60eff61c986ebf9df00c336068 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 16:47:48 -0600 Subject: [PATCH 099/200] add test calling multi_pairing with length covering many values mod N_MAX --- crypto/bls12381_utils.c | 13 ------------ crypto/bls12381_utils.h | 1 - crypto/bls_test.go | 46 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 4dbd09a5886..2b51739cb81 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1445,19 +1445,6 @@ void multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { final_exp(res_vec, res_vec); } -// TODO: remove -void test_pairing(const E1* h, const E1* s, const E2* pk) { - Fp12 e1, e2, e3; - e(&e1, h, pk); - Fp12_print_("e1", &e1); - e(&e2, s, BLS12_381_minus_g2); - Fp12_print_("e2", &e2); - Fp12_mult(&e3, &e2, &e1); - Fp12_print_("e3", &e3); -} - - - // This is a testing function. // It wraps a call to a Relic macro since cgo can't call macros. 
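/*
 * Note on multi_pairing (defined above): it computes
 *     e(p[0], q[0]) * ... * e(p[len-1], q[len-1])
 * by accumulating the Miller loops in batches of N_MAX pairs (which the
 * accompanying bls_test.go case exercises by covering many lengths modulo
 * N_MAX) and applying one shared final exponentiation. The BLS verifiers use
 * it by checking that
 *     e(sigma, -g2) * e(H_1, apk_1) * ... * e(H_n, apk_n) == 1 in GT,
 * which is equivalent to the BLS equation e(sigma, g2) = prod_i e(H_i, apk_i)
 * for a signature sigma in G1 and public keys in G2 (min_sig setting).
 */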
void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 2242e38edfe..c2b7c664cbd 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -130,7 +130,6 @@ void Fp12_set_one(Fp12*); void Fp12_inv(Fp12*); // TODO: remove void Fp12_mult(Fp12*, const Fp12*, const Fp12*); // TODO: remove void multi_pairing(Fp12*, const E1*, const E2*, const int); -void test_pairing(const E1*, const E1*, const E2*); // TODO: remove // Utility functions ctx_t* relic_init_BLS12_381(); diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 613e68c07a5..c3abbcfb673 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -904,10 +904,10 @@ func BenchmarkBatchVerify(b *testing.B) { func TestBLSAggregateSignaturesManyMessages(t *testing.T) { rand := getPRG(t) // number of signatures to aggregate - sigsNum := rand.Intn(20) + 1 + sigsNum := rand.Intn(40) + 1 sigs := make([]Signature, 0, sigsNum) - // number of keys + // number of keys (less than the number of signatures) keysNum := rand.Intn(sigsNum) + 1 sks := make([]PrivateKey, 0, keysNum) // generate the keys @@ -983,8 +983,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, - "verification should fail with an empty key list") + assert.False(t, valid, "verification should fail with an empty key list") }) // test inconsistent input arrays @@ -1019,6 +1018,45 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { assert.False(t, valid, "verification should fail with nil hasher") inputPks[0] = tmpPK }) + + t.Run("variable number of distinct keys and messages", func(t *testing.T) { + // use a specific PRG for easier reproduction + prg := getPRG(t) + // number of signatures to aggregate + N := 100 + sigs := make([]Signature, 0, N) + msgs := make([][]byte, 0, N) + pks := make([]PublicKey, 0, N) + kmacs := make([]hash.Hasher, 0, N) + kmac := NewExpandMsgXOFKMAC128("test tag") + for i := 0; i < N; i++ { + // distinct message + msg := make([]byte, 20) + msgs = append(msgs, msg) + _, err := prg.Read(msg) + require.NoError(t, err) + // distinct key + sk := randomSK(t, prg) + pks = append(pks, sk.PublicKey()) + // generate a signature + s, err := sk.Sign(msg, kmac) + require.NoError(t, err) + sigs = append(sigs, s) + kmacs = append(kmacs, kmac) + } + + // go through all numbers of couples (msg, key) + for i := 1; i < N; i++ { + // aggregate signatures + var err error + aggSig, err = AggregateBLSSignatures(sigs[:i]) + require.NoError(t, err) + // Verify the aggregated signature + valid, err := VerifyBLSSignatureManyMessages(pks[:i], aggSig, msgs[:i], kmacs[:i]) + require.NoError(t, err, "verification errored with %d couples (msg,key)", i) + assert.True(t, valid, "verification failed with %d couples (msg,key)", i) + } + }) } // TestBLSErrorTypes verifies working of error-type-detecting functions From 9167d9e3604f0c4e3f13fc300e588b45de774c7d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 16:57:18 -0600 Subject: [PATCH 100/200] use BLST multi_pairing to verify BLS SPoCK --- crypto/bls12381_utils.c | 76 +++++++++++------------------------------ 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 2b51739cb81..05124d81092 100644 --- a/crypto/bls12381_utils.c +++ 
b/crypto/bls12381_utils.c @@ -1189,79 +1189,41 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ // the membership check in G2 is separated to allow optimizing multiple verifications // using the same public keys. int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { - ep_t elemsG1[2]; - ep2_t elemsG2[2]; - ep_new(elemsG1[0]); - ep_new(elemsG1[1]); - ep2_new(elemsG2[1]); - ep2_new(elemsG2[0]); - int ret = UNDEFINED; + E1 elemsG1[2]; + E2 elemsG2[2]; // elemsG1[0] = s1 - E1 s; - if (E1_read_bytes(&s, sig1, SIGNATURE_LEN) != BLST_SUCCESS) { - ret = INVALID; - goto out; + if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != BLST_SUCCESS) { + return INVALID; }; // check s1 is in G1 - if (!E1_in_G1(&s)) { - ret = INVALID; - goto out; + if (!E1_in_G1(&elemsG1[0])) { + return INVALID; } - ep_st* s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[0], s_tmp); // elemsG1[1] = s2 - if (E1_read_bytes(&s, sig2, SIGNATURE_LEN) != BLST_SUCCESS) { - ret = INVALID; - goto out; + if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != BLST_SUCCESS) { + return INVALID; }; // check s2 is in G1 - if (!E1_in_G1(&s)) { - ret = INVALID; - goto out; + if (!E1_in_G1(&elemsG1[1])) { + return INVALID; } - s_tmp = E1_blst_to_relic(&s); - ep_copy(elemsG1[1], s_tmp); // elemsG2[1] = pk1 - ep2_st* pk_tmp = E2_blst_to_relic(pk1); - ep2_copy(elemsG2[1], pk_tmp); - - // elemsG2[0] = pk2 - pk_tmp = E2_blst_to_relic(pk2); - ep2_copy(elemsG2[0], pk_tmp); - free(pk_tmp); - free(s_tmp); + E2_copy(&elemsG2[1], pk1); // elemsG2[0] = -pk2 - ep2_neg(elemsG2[0], elemsG2[0]); + E2_neg(&elemsG2[0], pk2); - fp12_t pair; - fp12_new(&pair); - // double pairing with Optimal Ate - pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); + // double pairing + Fp12 e; + multi_pairing(&e, elemsG1 , elemsG2, 2); - // compare the result to 1 - int res = fp12_cmp_dig(pair, 1); - fp12_free(pair); - - if (core_get()->code == RLC_OK) { - if (res == RLC_EQ) { - ret = VALID; - } - else { - ret = INVALID; - } - goto out; - } - -out: - ep_free(elemsG1[0]); - ep_free(elemsG1[1]); - ep2_free(elemsG2[0]); - ep2_free(elemsG2[1]); - return ret; + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } // Subtracts all G2 array elements `y` from an element `x` and writes the From cf0aa6b84396c56cdfce5764fa60ba4849639a37 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 17:16:56 -0600 Subject: [PATCH 101/200] clean up Fp12 tools --- crypto/bls12381_utils.c | 14 -------------- crypto/bls12381_utils.h | 2 -- 2 files changed, 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 05124d81092..34a84fb629b 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1317,25 +1317,11 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { // ------------------- Pairing utilities bool_t Fp12_is_one(Fp12 *a) { - //return vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & - // vec_is_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); } void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); - //vec_copy(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])); - //vec_zero(a[0][1], sizeof(a) - sizeof(a[0][0])); -} - -// TODO: remove -void Fp12_inv(Fp12 *a) { - conjugate_fp12((vec384fp6*)a); -} - -// TODO: remove -void Fp12_mult(Fp12* ret, const Fp12* a, const Fp12* b){ - mul_fp12((vec384fp6*)ret, (vec384fp6*)a, (vec384fp6*)b); } static void e(Fp12* res, const E1* 
p, const E2* q) { diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index c2b7c664cbd..c1914fbedfd 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -127,8 +127,6 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 bool_t Fp12_is_one(Fp12*); void Fp12_set_one(Fp12*); -void Fp12_inv(Fp12*); // TODO: remove -void Fp12_mult(Fp12*, const Fp12*, const Fp12*); // TODO: remove void multi_pairing(Fp12*, const E1*, const E2*, const int); // Utility functions From 98152ba0649638b8198a10482422f5322c9ad54c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 17:29:46 -0600 Subject: [PATCH 102/200] clean up Relic tools --- crypto/bls.go | 7 - crypto/bls12381_utils.c | 271 +-------------------------------------- crypto/bls12381_utils.go | 15 --- crypto/bls12381_utils.h | 24 +--- crypto/bls_core.c | 2 - 5 files changed, 9 insertions(+), 310 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 8cfd435b380..c4be5a3aa85 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -85,8 +85,6 @@ var expandMsgOutput = int(C.get_mapToG1_input_len()) // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { - // points to Relic context of BLS12-381 with all the parameters - context ctx // the signing algo and parameters algo SigningAlgorithm } @@ -535,11 +533,6 @@ var prKeyLengthBLSBLS12381 = int(C.get_sk_len()) // init sets the context of BLS12-381 curve func (a *blsBLS12381Algo) init() error { - // initializes relic context and sets the B12_381 parameters - if err := a.context.initContext(); err != nil { - return err - } - // compare the Go and C layer constants as a sanity check if signatureLengthBLSBLS12381 != SignatureLenBLSBLS12381 || pubKeyLengthBLSBLS12381 != PubKeyLenBLSBLS12381 || diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 34a84fb629b..7055a7efa1b 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -8,6 +8,7 @@ #include "bls_include.h" #include "assert.h" +// compile all blst C src along with this file #include "blst_src.c" // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) @@ -29,33 +30,6 @@ int get_mapToG1_input_len() { return MAP_TO_G1_INPUT_LEN; } - -// Initializes Relic context with BLS12-381 parameters -ctx_t* relic_init_BLS12_381() { - // check Relic was compiled with the right conf - assert(ALLOC == AUTO); - - // sanity check of Relic constants the package is relying on - assert(RLC_OK == RLC_EQ); - - // initialize relic core with a new context - ctx_t* bls_ctx = (ctx_t*) calloc(1, sizeof(ctx_t)); - if (!bls_ctx) return NULL; - core_set(bls_ctx); - if (core_init() != RLC_OK) return NULL; - - // init BLS curve - int ret = RLC_OK; - #if (FP_PRIME == 381) - ret = ep_param_set_any_pairf(); // sets B12_P381 if FP_PRIME = 381 in relic config - #else - ep_param_set(B12_P381); - ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist - #endif - if (ret != RLC_OK) return NULL; - return core_get(); -} - // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -397,44 +371,6 @@ void Fp_write_bytes(byte *bin, const Fp* a) { be_bytes_from_limbs(bin, (limb_t*)a, Fp_BYTES); } -// fp_read_bin_safe is a modified version of Relic's (void fp_read_bin). -// It reads a field element from a buffer and makes sure the big number read can be -// written as a field element (is reduced modulo p). 
-// Unlike Relic's versions, the function does not reduce the read integer modulo p and does -// not throw an exception for an integer larger than p. The function returns RLC_OK if the input -// corresponds to a field element, and returns RLC_ERR otherwise. -static int fp_read_bin_safe(fp_t a, const byte *bin, int len) { - if (len != Fp_BYTES) { - return RLC_ERR; - } - - int ret = RLC_ERR; - bn_t t; - bn_new(t); - bn_read_bin(t, bin, Fp_BYTES); - - // make sure read bn is reduced modulo p - // first check is sanity check, since current implementation of `bn_read_bin` insures - // output bn is positive - if (bn_sign(t) == RLC_NEG || bn_cmp(t, &core_get()->prime) != RLC_LT) { - goto out; - } - - if (bn_is_zero(t)) { - fp_zero(a); - } else { - if (t->used == 1) { - fp_prime_conv_dig(a, t->dp[0]); - } else { - fp_prime_conv(a, t); - } - } - ret = RLC_OK; -out: - bn_free(t); - return ret; -} - // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form @@ -524,81 +460,6 @@ void Fp2_write_bytes(byte *bin, const Fp2* a) { // ------------------- E1 utilities -// TODO: to delete, only used by temporary E2_blst_to_relic -static int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { - // check the length - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - if (len!=G1_size) { - return RLC_ERR; - } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; - } - - // check if the point is infinity - int is_infinity = bin[0] & (1<<6); - if (is_infinity) { - // check if the remaining bits are cleared - if (bin[0] & 0x3F) { - return RLC_ERR; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { - return RLC_ERR; - } - - a->coord = BASIC; - fp_set_dig(a->z, 1); - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp_BYTES]; - memcpy(temp, bin, Fp_BYTES); - temp[0] &= 0x1F; - if (fp_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; - } - - if (G1_SERIALIZATION == UNCOMPRESSED) { - if (fp_read_bin_safe(a->y, bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - // check read point is on curve - if (!ep_on_curve(a)) { - return RLC_ERR; - } - return RLC_OK; - } - fp_zero(a->y); - fp_set_bit(a->y, 0, y_sign); - if (ep_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; - } - return RLC_ERR; -} - -// TODO: temp utility function to delete -ep_st* E1_blst_to_relic(const E1* x) { - ep_st* out = (ep_st*)malloc(sizeof(ep_st)); - byte* data = (byte*)malloc(G1_SER_BYTES); - E1_write_bytes(data, x); - ep_read_bin_compact(out, data, G1_SER_BYTES); - free(data); - return out; -} - void E1_copy(E1* res, const E1* p) { if ((uptr_t)p == (uptr_t)res) { return; @@ -872,97 +733,6 @@ const E1* BLS12_381_g1 = (const E1*)&BLS12_381_G1; /// TODO:delete const E2* BLS12_381_g2 = (const E2*)&BLS12_381_G2; const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; -// TODO: to delete -static int fp2_read_bin_safe(fp2_t a, const byte *bin, int len) { - if (len != Fp2_BYTES) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[0], bin, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[1], bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - return RLC_OK; -} - -// TODO: to delete, only used by temporary E2_blst_to_relic -static int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { - // check the length - const int G2size = (G2_BYTES/(G2_SERIALIZATION+1)); - if 
(len!=G2size) { - return RLC_ERR; - } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; - } - - // check if the point in infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (bin[0] & 0x3F) { - return RLC_ERR; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { - return RLC_ERR; - } - - a->coord = BASIC; - fp2_set_dig(a->z, 1); // a.z - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp2_BYTES]; - memcpy(temp, bin, Fp2_BYTES); - temp[0] &= 0x1F; // clear the header bits - if (fp2_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; - } - - if (G2_SERIALIZATION == UNCOMPRESSED) { - if (fp2_read_bin_safe(a->y, bin + Fp2_BYTES, Fp2_BYTES) != RLC_OK){ - return RLC_ERR; - } - // check read point is on curve - if (!ep2_on_curve(a)) { - return RLC_ERR; - } - return RLC_OK; - } - - fp2_zero(a->y); - fp_set_bit(a->y[0], 0, y_sign); - fp_zero(a->y[1]); - if (ep2_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; - } - return RLC_ERR; -} - -// TODO: temp utility function to delete -ep2_st* E2_blst_to_relic(const E2* x) { - ep2_st* out = (ep2_st*)malloc(sizeof(ep2_st)); - byte* data = (byte*)malloc(G2_SER_BYTES); - E2_write_bytes(data, x); - ep2_read_bin_compact(out, data, G2_SER_BYTES); - free(data); - return out; -} - // E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or uncompressed form. // The resulting point is guaranteed to be on curve E2 (no G2 check is included). // @@ -1324,13 +1094,6 @@ void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); } -static void e(Fp12* res, const E1* p, const E2* q) { - E1 p_aff; E1_to_affine(&p_aff, p); - E2 q_aff; E2_to_affine(&q_aff, q); - miller_loop_n((vec384fp6*)res, (POINTonE2_affine*)&q_aff, (POINTonE1_affine*)&p_aff, 1); - final_exp((vec384fp6*)res, (vec384fp6*)res); -} - // computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) // by optimizing a common final exponentiation for all pairings. // result is stored in `res`. 
@@ -1401,8 +1164,7 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int // DEBUG printing functions -#define DEBUG 1 -#if DEBUG==1 +#if (DEBUG == 1) void bytes_print_(char* s, byte* data, int len) { if (strlen(s)) printf("[%s]:\n", s); for (int i=0; ileft) { // no need to check the right child for the leaf check because // the recursive build starts with the left side first - // relic free - if (root->sig) ep_free(root->sig); // pointer free free(root->sig); free(root->pk); From b6b90d61c131db2451dfc5f6d262bc6a6028b1ab Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 23 May 2023 17:39:43 -0600 Subject: [PATCH 103/200] uncomment a test --- crypto/bls_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index c3abbcfb673..377683addf2 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -19,7 +19,7 @@ import ( // TestBLSMainMethods is a sanity check of main signature scheme methods (keyGen, sign, verify) func TestBLSMainMethods(t *testing.T) { // test the key generation seed lengths - //testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) + testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen) // test the consistency with different inputs hasher := NewExpandMsgXOFKMAC128("test tag") testGenSignVerify(t, BLSBLS12381, hasher) From 3ec1ecfdda9d6a52bb8128346e4f5fabb2675bf3 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 14:18:07 -0600 Subject: [PATCH 104/200] remove relic tag and delete non-needed files --- crypto/bls.go | 15 --- crypto/bls12381_utils.c | 2 - crypto/bls12381_utils.go | 3 - crypto/bls12381_utils.h | 2 - crypto/bls12381_utils_test.go | 3 - crypto/bls_core.c | 2 - crypto/bls_crossBLST_test.go | 3 - crypto/bls_include.h | 2 - crypto/bls_multisig.go | 3 - crypto/bls_no_relic.go | 156 ----------------------------- crypto/bls_no_relic_test.go | 42 -------- crypto/bls_test.go | 3 - crypto/bls_thresholdsign.go | 3 - crypto/bls_thresholdsign_core.c | 2 - crypto/bls_thresholdsign_include.h | 2 - crypto/bls_thresholdsign_test.go | 3 - crypto/blst_include.h | 2 - crypto/blst_src/blst_src.c | 2 - crypto/dkg_core.c | 2 - crypto/dkg_feldmanvss.go | 3 - crypto/dkg_feldmanvssq.go | 3 - crypto/dkg_include.h | 2 - crypto/dkg_jointfeldman.go | 3 - crypto/dkg_test.go | 3 - crypto/ecdsa_test.go | 3 - crypto/sign.go | 35 +++---- crypto/sign_norelic.go | 13 --- crypto/sign_relic.go | 42 -------- crypto/spock.go | 3 - crypto/spock_test.go | 3 - 30 files changed, 18 insertions(+), 347 deletions(-) delete mode 100644 crypto/bls_no_relic.go delete mode 100644 crypto/bls_no_relic_test.go delete mode 100644 crypto/sign_norelic.go delete mode 100644 crypto/sign_relic.go diff --git a/crypto/bls.go b/crypto/bls.go index c4be5a3aa85..c2e43aee908 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // BLS signature scheme implementation using BLS12-381 curve @@ -39,7 +36,6 @@ import "C" import ( "bytes" "crypto/sha256" - "errors" "fmt" "golang.org/x/crypto/hkdf" @@ -531,17 +527,6 @@ var signatureLengthBLSBLS12381 = int(C.get_signature_len()) var pubKeyLengthBLSBLS12381 = int(C.get_pk_len()) var prKeyLengthBLSBLS12381 = int(C.get_sk_len()) -// init sets the context of BLS12-381 curve -func (a *blsBLS12381Algo) init() error { - // compare the Go and C layer constants as a sanity check - if signatureLengthBLSBLS12381 != SignatureLenBLSBLS12381 || - pubKeyLengthBLSBLS12381 != PubKeyLenBLSBLS12381 || 
- prKeyLengthBLSBLS12381 != PrKeyLenBLSBLS12381 { - return errors.New("BLS-12381 length settings in Go and C are not consistent, check hardcoded lengths and compressions") - } - return nil -} - // This is only a TEST function. // signWithXMDSHA256 signs a message using XMD_SHA256 as a hash to field. // diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 7055a7efa1b..4181d69fbc7 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1,5 +1,3 @@ -// +build relic - // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 794be4fb705..ff60bede6d5 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // this file contains utility functions for the curve BLS 12-381 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 9670d922b15..826872c0e7c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -1,5 +1,3 @@ -// +build relic - // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ae1b240d8ae..69d7e687f9b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e4f1be2dfa4..6711320cf51 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -1,5 +1,3 @@ -// +build relic - #include "bls_include.h" // this file is about the core functions required by the BLS signature scheme diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index ffdb156e251..6d3f1765e25 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // This file contains tests against the library BLST (https://github.com/supranational/blst). diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 3fc56062ab5..7060ac10bdc 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -1,5 +1,3 @@ -// +build relic - // this file is about the core functions required by the BLS signature scheme #ifndef _REL_BLS_INCLUDE_H diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 0981103120a..aa8e669924a 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/bls_no_relic.go b/crypto/bls_no_relic.go deleted file mode 100644 index fed6c216398..00000000000 --- a/crypto/bls_no_relic.go +++ /dev/null @@ -1,156 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -import ( - "github.com/onflow/flow-go/crypto/hash" -) - -// The functions below are the non-Relic versions of the public APIs -// requiring the Relic library. -// All BLS functionalities in the package require the Relic dependency, -// and therefore the "relic" build tag. -// Building without the "relic" tag is successful, but and calling one of the -// BLS functions results in a runtime panic. This allows projects depending on the -// crypto library to build successfully with or without the "relic" tag. 
- -const relic_panic = "function is not supported when building without \"relic\" Go build tag" - -const ( - SignatureLenBLSBLS12381 = 48 -) - -// bls.go functions -func NewExpandMsgXOFKMAC128(tag string) hash.Hasher { - panic(relic_panic) -} - -func BLSInvalidSignature() Signature { - panic(relic_panic) -} - -// bls_multisig.go functions -func BLSGeneratePOP(sk PrivateKey) (Signature, error) { - panic(relic_panic) -} - -func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { - panic(relic_panic) -} - -func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - panic(relic_panic) -} - -func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - panic(relic_panic) -} - -func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - panic(relic_panic) -} - -func IdentityBLSPublicKey() PublicKey { - panic(relic_panic) -} - -func IsBLSAggregateEmptyListError(err error) bool { - panic(relic_panic) -} - -func IsInvalidSignatureError(err error) bool { - panic(relic_panic) -} - -func IsNotBLSKeyError(err error) bool { - panic(relic_panic) -} - -func IsBLSSignatureIdentity(s Signature) bool { - panic(relic_panic) -} - -func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - panic(relic_panic) -} - -func VerifyBLSSignatureOneMessage(pks []PublicKey, s Signature, - message []byte, kmac hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func VerifyBLSSignatureManyMessages(pks []PublicKey, s Signature, - messages [][]byte, kmac []hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func BatchVerifyBLSSignaturesOneMessage(pks []PublicKey, sigs []Signature, - message []byte, kmac hash.Hasher) ([]bool, error) { - panic(relic_panic) -} - -func SPOCKProve(sk PrivateKey, data []byte, kmac hash.Hasher) (Signature, error) { - panic(relic_panic) -} - -func SPOCKVerifyAgainstData(pk PublicKey, proof Signature, data []byte, kmac hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signature) (bool, error) { - panic(relic_panic) -} - -// bls_threshold.go functions -func NewBLSThresholdSignatureParticipant( - groupPublicKey PublicKey, - sharePublicKeys []PublicKey, - threshold int, - myIndex int, - myPrivateKey PrivateKey, - message []byte, - dsTag string, -) (ThresholdSignatureParticipant, error) { - panic(relic_panic) -} - -func NewBLSThresholdSignatureInspector( - groupPublicKey PublicKey, - sharePublicKeys []PublicKey, - threshold int, - message []byte, - dsTag string, -) (ThresholdSignatureInspector, error) { - panic(relic_panic) -} - -func BLSReconstructThresholdSignature(size int, threshold int, - shares []Signature, signers []int) (Signature, error) { - panic(relic_panic) -} - -func EnoughShares(threshold int, sharesNumber int) (bool, error) { - panic(relic_panic) -} - -func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, - []PublicKey, PublicKey, error) { - panic(relic_panic) -} - -// dkg.go functions -func NewFeldmanVSS(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - panic(relic_panic) -} - -func NewFeldmanVSSQual(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - panic(relic_panic) -} - -func NewJointFeldman(size int, threshold int, myIndex int, - processor DKGProcessor) (DKGState, error) { - panic(relic_panic) -} diff --git a/crypto/bls_no_relic_test.go b/crypto/bls_no_relic_test.go deleted file mode 100644 index 
47f8120060f..00000000000 --- a/crypto/bls_no_relic_test.go +++ /dev/null @@ -1,42 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -// Test for all public APIs requiring relic build tag. -// These functions should panic if build without the relic tag. -func TestNoRelicPanic(t *testing.T) { - assert.PanicsWithValue(t, relic_panic, func() { NewExpandMsgXOFKMAC128("") }) - assert.PanicsWithValue(t, relic_panic, func() { BLSInvalidSignature() }) - assert.PanicsWithValue(t, relic_panic, func() { BLSGeneratePOP(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { BLSVerifyPOP(nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSSignatures(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSPrivateKeys(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSPublicKeys(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IdentityBLSPublicKey() }) - assert.PanicsWithValue(t, relic_panic, func() { IsBLSAggregateEmptyListError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsInvalidSignatureError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsNotBLSKeyError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsBLSSignatureIdentity(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { RemoveBLSPublicKeys(nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { VerifyBLSSignatureOneMessage(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { VerifyBLSSignatureManyMessages(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { BatchVerifyBLSSignaturesOneMessage(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKProve(nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKVerify(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKVerifyAgainstData(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { NewBLSThresholdSignatureParticipant(nil, nil, 0, 0, nil, nil, "") }) - assert.PanicsWithValue(t, relic_panic, func() { NewBLSThresholdSignatureInspector(nil, nil, 0, nil, "") }) - assert.PanicsWithValue(t, relic_panic, func() { BLSReconstructThresholdSignature(0, 0, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { EnoughShares(0, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { BLSThresholdKeyGen(0, 0, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { NewFeldmanVSS(0, 0, 0, nil, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { NewFeldmanVSSQual(0, 0, 0, nil, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { NewJointFeldman(0, 0, 0, nil) }) -} diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 377683addf2..801af0a24a5 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 1d19ca42504..3cef4d4e605 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 027579d3dae..e160a16e7c9 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -1,5 +1,3 @@ -// +build relic - #include "bls_thresholdsign_include.h" // the highest index of a threshold participant diff --git 
a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index ce88c460f95..1275b10bab4 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -1,5 +1,3 @@ -// +build relic - #ifndef _REL_THRESHOLD_INCLUDE_H #define _REL_THRESHOLD_INCLUDE_H diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 9db32e0fe85..3e55f3d1806 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 1f7b2484a3c..e408c9c0c70 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -1,5 +1,3 @@ -// +build relic - #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index 4b0732e06e4..b904a5d52ee 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,5 +1,3 @@ -// +build relic - #include "keygen.c" #include "hash_to_field.c" #include "e1.c" diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 89f09e35da0..2b34572089c 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -1,5 +1,3 @@ -// +build relic - #include "dkg_include.h" // computes P(x) = a_0 + a_1*x + .. + a_n x^n in F_r diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 64f2a11c383..dd81bbcd79c 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 69393768fe5..620c962faaa 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index e8489fbf669..ca6619eb10f 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -1,5 +1,3 @@ -// +build relic - #ifndef _REL_DKG_INCLUDE_H #define _REL_DKG_INCLUDE_H diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 8de9695a0c5..c4fb23f578e 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // #cgo CFLAGS: diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 32a1b9982b4..2bd4dc51fa0 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( diff --git a/crypto/ecdsa_test.go b/crypto/ecdsa_test.go index 342162668cf..d5d38f8e947 100644 --- a/crypto/ecdsa_test.go +++ b/crypto/ecdsa_test.go @@ -1,6 +1,3 @@ -//go:build !relic -// +build !relic - package crypto import ( diff --git a/crypto/sign.go b/crypto/sign.go index 68196acba2d..788a55618d4 100644 --- a/crypto/sign.go +++ b/crypto/sign.go @@ -49,20 +49,22 @@ type signer interface { decodePublicKeyCompressed([]byte) (PublicKey, error) } -// newNonRelicSigner returns a signer that does not depend on Relic library. -func newNonRelicSigner(algo SigningAlgorithm) (signer, error) { +// newSigner returns a signer that does not depend on Relic library. 
+func newSigner(algo SigningAlgorithm) (signer, error) { switch algo { case ECDSAP256: return p256Instance, nil case ECDSASecp256k1: return secp256k1Instance, nil + case BLSBLS12381: + return blsInstance, nil default: return nil, invalidInputsErrorf("the signature scheme %s is not supported", algo) } } // Initialize the context of all algos not requiring Relic -func initNonRelic() { +func init() { // P-256 p256Instance = &(ecdsaAlgo{ curve: elliptic.P256(), @@ -74,19 +76,10 @@ func initNonRelic() { curve: btcec.S256(), algo: ECDSASecp256k1, }) -} -// Signature format Check for non-relic algos (ECDSA) -func signatureFormatCheckNonRelic(algo SigningAlgorithm, s Signature) (bool, error) { - switch algo { - case ECDSAP256: - return p256Instance.signatureFormatCheck(s), nil - case ECDSASecp256k1: - return secp256k1Instance.signatureFormatCheck(s), nil - default: - return false, invalidInputsErrorf( - "the signature scheme %s is not supported", - algo) + // bls12-381 + blsInstance = &blsBLS12381Algo{ + algo: BLSBLS12381, } } @@ -98,8 +91,16 @@ func signatureFormatCheckNonRelic(algo SigningAlgorithm, s Signature) (bool, err // If SignatureFormatCheck returns false then the input is not a valid // signature and will fail a verification against any message and public key. func SignatureFormatCheck(algo SigningAlgorithm, s Signature) (bool, error) { - // For now, signatureFormatCheckNonRelic is only defined for non-Relic algos. - return signatureFormatCheckNonRelic(algo, s) + switch algo { + case ECDSAP256: + return p256Instance.signatureFormatCheck(s), nil + case ECDSASecp256k1: + return secp256k1Instance.signatureFormatCheck(s), nil + default: + return false, invalidInputsErrorf( + "the signature scheme %s is not supported", + algo) + } } // GeneratePrivateKey generates a private key of the algorithm using the entropy of the given seed. diff --git a/crypto/sign_norelic.go b/crypto/sign_norelic.go deleted file mode 100644 index 7e6dd4c0d10..00000000000 --- a/crypto/sign_norelic.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -// newSigner chooses and initializes a signature scheme -func newSigner(algo SigningAlgorithm) (signer, error) { - return newNonRelicSigner(algo) -} - -func init() { - initNonRelic() -} diff --git a/crypto/sign_relic.go b/crypto/sign_relic.go deleted file mode 100644 index 980fca20c51..00000000000 --- a/crypto/sign_relic.go +++ /dev/null @@ -1,42 +0,0 @@ -//go:build relic -// +build relic - -package crypto - -import ( - "fmt" -) - -// newSigner chooses and initializes a signature scheme -func newSigner(algo SigningAlgorithm) (signer, error) { - // try Relic algos - if signer := relicSigner(algo); signer != nil { - return signer, nil - } - // return a non-Relic algo - return newNonRelicSigner(algo) -} - -// relicSigner returns a signer that depends on Relic library. 
-func relicSigner(algo SigningAlgorithm) signer { - if algo == BLSBLS12381 { - return blsInstance - } - return nil -} - -// Initialize Relic with the BLS context on BLS 12-381 -func init() { - initRelic() - initNonRelic() -} - -// Initialize the context of all algos requiring Relic -func initRelic() { - blsInstance = &blsBLS12381Algo{ - algo: BLSBLS12381, - } - if err := blsInstance.init(); err != nil { - panic(fmt.Sprintf("initialization of BLS failed: %s", err.Error())) - } -} diff --git a/crypto/spock.go b/crypto/spock.go index 4fbd974c27f..46673b0bb13 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // SPoCK design based on the BLS signature scheme. diff --git a/crypto/spock_test.go b/crypto/spock_test.go index 596968234e4..75de3dea838 100644 --- a/crypto/spock_test.go +++ b/crypto/spock_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( From 64112cca8a4c4854ccdbc6d5924214dcfc332d07 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 14:22:00 -0600 Subject: [PATCH 105/200] remove relic build scripts --- crypto/bls12381_utils.go | 4 +- crypto/build_dependency.sh | 36 --------------- crypto/relic_build.sh | 90 -------------------------------------- 3 files changed, 1 insertion(+), 129 deletions(-) delete mode 100644 crypto/build_dependency.sh delete mode 100755 crypto/relic_build.sh diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index ff60bede6d5..d56be090332 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,8 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" @@ -37,7 +36,6 @@ import ( ) // Go wrappers around BLST C types -// Go wrappers around Relic C types type pointE1 C.E1 type pointE2 C.E2 type scalar C.Fr diff --git a/crypto/build_dependency.sh b/crypto/build_dependency.sh deleted file mode 100644 index 4bfe99dbad2..00000000000 --- a/crypto/build_dependency.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -PKG_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -RELIC_DIR_NAME="relic" -RELIC_DIR="${PKG_DIR}/${RELIC_DIR_NAME}" - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! -x ${PKG_DIR} ]]; then - chmod -R 755 "${PKG_DIR}" -fi - -rm -rf "${RELIC_DIR}" - -# relic version or tag -relic_version="7d885d1ba34be61bf22190943a73549a910c1714" - -# clone a specific version of Relic without history if it's tagged. 
-# git -c http.sslVerify=true clone --branch $(relic_version) --single-branch --depth 1 https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } - -# clone all the history if the version is only defined by a commit hash. -git -c http.sslVerify=true clone --branch main --single-branch https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } - -if [ -d "${RELIC_DIR}" ] -then - ( - cd ${RELIC_DIR_NAME} || { echo "cd relic failed"; exit 1; } - git checkout $relic_version - ) - # build relic - bash relic_build.sh -else - { echo "couldn't find relic directory"; exit 1; } -fi - diff --git a/crypto/relic_build.sh b/crypto/relic_build.sh deleted file mode 100755 index 6cff3a6b478..00000000000 --- a/crypto/relic_build.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -pushd "$DIR" - -# Ensure the directory is writeable -chmod -R +w "$(pwd)" - -mkdir -p "$DIR/relic/build" -pushd "$DIR/relic/build" - - -# make cmake print its CC interpretation -CMAKE_FILE="${DIR}/relic/CMakeLists.txt" -# parameter expansion is not suitable here -# shellcheck disable=SC2089 -CMAKE_PRINT_CC="message ( STATUS \"CC=\$ENV{CC}\" )" -# Make the cmake run print its interpretation of CC -echo "$CMAKE_PRINT_CC" >> "${CMAKE_FILE}" - -# Probe cmake's MakeFile generation and extract the CC version -CMAKE_TEMP=$(mktemp) -cmake .. > "$CMAKE_TEMP" -CC_VAL="$(tail -n 5 "$CMAKE_TEMP" | grep -oE -m 1 'CC=.*$')" -CC_VAL="${CC_VAL:3}" - -# de-mangle the CMakeLists file, using a temporary file for BSD compatibility -sed '$d' ../CMakeLists.txt > "$CMAKE_TEMP" -mv "$CMAKE_TEMP" ../CMakeLists.txt - -# default to which -CC_VAL=${CC_VAL:-"$(which cc)"} -CC_VERSION_STR="$($CC_VAL --version)" - -# we use uname to record which arch we are running on -ARCH=$(uname -m 2>/dev/null || true) - -if [[ "$ARCH" =~ "x86_64" ]]; then - # Compile as westmere arch to avoid cross-compilation issues on machines not supporting AVX extensions. - # Relic performance as used in flow crypto library is not impacted by whether it is compiled with "native" or "westmere", as proven by benchmark results. - MARCH="-march=westmere" -elif [[ "$ARCH" =~ ^(arm64|armv7|armv7s)$ && "${CC_VERSION_STR[0]}" =~ (clang) ]]; then - # the "-march=native" option is not supported with clang on ARM - MARCH="" -else - MARCH="-march=native" -fi - -# Set RELIC config for Flow -COMP=(-DCFLAGS="-O3 -funroll-loops -fomit-frame-pointer ${MARCH} -mtune=native") -GENERAL=(-DTIMER=CYCLE -DCHECK=OFF -DVERBS=OFF) -LIBS=(-DSHLIB=OFF -DSTLIB=ON) -RAND=(-DRAND=HASHD -DSEED=) - -# -BN_REP=(-DALLOC=AUTO -DALIGN=1 -DWSIZE=64 -DBN_PRECI=1024 -DBN_MAGNI=DOUBLE) -ARITH=(-DARITH=EASY) -PRIME=(-DFP_PRIME=381) - -# -BN_METH=(-DBN_KARAT=0 -DBN_METHD="COMBA;COMBA;MONTY;SLIDE;BINAR;BASIC") -FP_METH=(-DFP_KARAT=0 -DFP_METHD="INTEG;INTEG;INTEG;MONTY;MONTY;JMPDS;SLIDE") -PRIMES=(-DFP_PMERS=OFF -DFP_QNRES=ON) -FPX_METH=(-DFPX_METHD="INTEG;INTEG;LAZYR") -EP_METH=(-DEP_MIXED=ON -DEP_PLAIN=OFF -DEP_ENDOM=ON -DEP_SUPER=OFF\ - -DEP_CTMAP=ON -DEP_METHD="JACOB;LWNAF;COMBS;INTER") -PP_METH=(-DPP_METHD="LAZYR;OATEP") - -# run cmake -cmake "${COMP[@]}" "${GENERAL[@]}" \ - "${LIBS[@]}" "${RAND[@]}" \ - "${BN_REP[@]}" "${ARITH[@]}" \ - "${PRIME[@]}" "${PRIMES[@]}" \ - "${EP_METH[@]}" \ - "${BN_METH[@]}" \ - "${FP_METH[@]}" \ - "${FPX_METH[@]}" \ - "${PP_METH[@]}" .. 
- - -# Compile the static library -make clean -make relic_s -j8 -rm -f CMakeCache.txt - -popd -popd From 19a21db959b5fdfb68e30d5350bbc50648247f16 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 14:59:13 -0600 Subject: [PATCH 106/200] remove relic macros and xmd_sha256, remove relic binray from LD flags --- crypto/bls.go | 2 -- crypto/bls12381_utils.c | 6 +++--- crypto/bls12381_utils.go | 1 + crypto/bls12381_utils.h | 6 +++--- crypto/bls_include.h | 1 - crypto/bls_multisig.go | 3 +-- crypto/dkg_jointfeldman.go | 2 -- crypto/spock.go | 2 -- 8 files changed, 8 insertions(+), 15 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c2e43aee908..f49f4661772 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -28,8 +28,6 @@ package crypto // - membership checks G2 using Bowe's method (https://eprint.iacr.org/2019/814.pdf) // - implement a G1/G2 swap (signatures on G2 and public keys on G1) -// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 4181d69fbc7..a6b1e5c5e44 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1154,10 +1154,10 @@ void multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { final_exp(res_vec, res_vec); } -// This is a testing function. -// It wraps a call to a Relic macro since cgo can't call macros. +// This is a testing function and is not used in exported functions +// It uses an expand message XMD based on SHA2-256. void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ - md_xmd_sh256(hash, len_hash, msg, len_msg, dst, len_dst); + expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index d56be090332..9695d45aba2 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -12,6 +12,7 @@ package crypto // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) // # include // # include +// # include // static void handler(int signum) // { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=-O -D__BLST_PORTABLE__"; // ssize_t n = write(2, &text, strlen(text)); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 826872c0e7c..d2f2d8b489f 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -5,12 +5,12 @@ #ifndef _REL_MISC_INCLUDE_H #define _REL_MISC_INCLUDE_H -#include "relic.h" +#include #include "blst_include.h" #define SEC_BITS 128 -#define VALID RLC_OK -#define INVALID RLC_ERR +#define VALID 0 +#define INVALID 1 #define UNDEFINED (((VALID&1)^1) | ((INVALID&2)^2)) // different value than RLC_OK and RLC_ERR #define BITS_TO_BYTES(x) ((x+7)>>3) diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 7060ac10bdc..4b8e1075501 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -3,7 +3,6 @@ #ifndef _REL_BLS_INCLUDE_H #define _REL_BLS_INCLUDE_H -#include "relic.h" #include "bls12381_utils.h" // Signature, Public key and Private key lengths diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index aa8e669924a..e451c8d41f5 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -30,8 +30,7 @@ import ( // - batch verification of multiple signatures of a single message under multiple // public keys: use a binary tree of aggregations to find the invalid signatures. 
-// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #include "bls12381_utils.h" // #include "bls_include.h" import "C" diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index c4fb23f578e..40db316efb5 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -1,7 +1,5 @@ package crypto -// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" import "C" diff --git a/crypto/spock.go b/crypto/spock.go index 46673b0bb13..8180b9b72bd 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -3,8 +3,6 @@ package crypto // SPoCK design based on the BLS signature scheme. // BLS is using BLS12-381 curve and the same settings in bls.go. -// #cgo CFLAGS: -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" import ( From e58fe245920f8be1fbc7a4759ab3bf3c723062e1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 17:33:01 -0600 Subject: [PATCH 107/200] update Makefile and dockerignore --- crypto/.dockerignore | 1 - crypto/Makefile | 34 ++++++---------------------------- 2 files changed, 6 insertions(+), 29 deletions(-) delete mode 100644 crypto/.dockerignore diff --git a/crypto/.dockerignore b/crypto/.dockerignore deleted file mode 100644 index 5c75f82093a..00000000000 --- a/crypto/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -relic/build diff --git a/crypto/Makefile b/crypto/Makefile index d87f27c440f..a75e00df15b 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -12,41 +12,19 @@ endif ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -.PHONY: setup -setup: - go generate - -# test BLS-related functionalities requiring the Relic library (and hence relic Go build flag) -.PHONY: relic_tests -relic_tests: +# test all packages +.PHONY: test +test: +# root package (it uses BLST source files underneath which requires testing for ADX support) ifeq ($(ADX_SUPPORT), 1) - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) + go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) else - CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) + CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) endif - -# test all packages that do not require Relic library (all functionalities except the BLS-related ones) -.PHONY: non_relic_tests -non_relic_tests: -# root package without relic - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) # sub packages go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random -############################################################################################ -# CAUTION: DO NOT MODIFY THIS TARGET! 
DOING SO WILL BREAK THE FLAKY TEST MONITOR - -# sets up the crypto module and runs all tests -.PHONY: test -test: setup unittest - -# runs the unit tests of the module (assumes the module was set up) -.PHONY: unittest -unittest: relic_tests non_relic_tests - -############################################################################################ - .PHONY: docker-build docker-build: docker build -t gcr.io/dl-flow/golang-cmake:latest -t gcr.io/dl-flow/golang-cmake:$(IMAGE_TAG) . From 81239706a01f8afa320f251dc3153195f86e5a7a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:17:59 -0600 Subject: [PATCH 108/200] remove Relic mentions in code and README --- crypto/README.md | 91 +++++------------------------------------- crypto/bls.go | 24 +++++------ crypto/bls_multisig.go | 13 +++--- crypto/sign.go | 4 +- 4 files changed, 31 insertions(+), 101 deletions(-) diff --git a/crypto/README.md b/crypto/README.md index 9f29ad03e16..97156fa52c9 100644 --- a/crypto/README.md +++ b/crypto/README.md @@ -6,86 +6,22 @@ Most of the primitives and protocols can be used in other projects and are not s Flow is an ongoing project, which means that new features will still be added and modifications will still be made to improve security and performance of the cryptography package. Notes: - - The package has been audited for security in January 2021 on [this version](https://github.com/onflow/flow-go/tree/2707acdabb851138e298b2d186e73f47df8a14dd). The package had a few improvements since. + - The package has been audited for security in January 2021 on [this version](https://github.com/onflow/flow-go/tree/2707acdabb851138e298b2d186e73f47df8a14dd). The package had a major refactor to switch all the BLS12-381 curve implementation to use [BLST](https://github.com/supranational/blst/tree/master/src) starting from [this version](TODO: link the commit/tag). - The package does not provide security against side channel or fault attacks. ## Package import -Cloning Flow repository and following the [installation steps](https://github.com/onflow/flow-go) builds the necessary tools to use Flow cryptography. +To use the Flow cryptography package, you can: -If you wish to only import the Flow cryptography package into your Go project, please follow the following steps: - -- Get Flow cryptography package +- get the package ``` go get github.com/onflow/flow-go/crypto ``` -or simply import the package to your Go project +- or simply import the package to your Go project ``` import "github.com/onflow/flow-go/crypto" ``` -This is enough to run the package code for many functionalities. However, this isn't enough if BLS signature related functionalities are used. The BLS features rely on an extrnal C library ([Relic](https://github.com/relic-toolkit/relic)) for lower level mathematical operations. Building your project at this stage including BLS functionalities would result in build errors related to missing "relic" files. For instance: -``` -fatal error: 'relic.h' file not found -#include "relic.h" - ^~~~~~~~~ -``` - - An extra step is required to compile the external dependency (Relic) locally. - -- Install [CMake](https://cmake.org/install/), which is used for building the package. The build also requires [Git](http://git-scm.com/) and bash scripting. -- From the Go package directory in `$GOPATH/pkg/mod/github.com/onflow/flow-go/crypto@/`, build the package dependencies. `version-tag` is the imported package version. 
-For instance: -``` -cd $GOPATH/pkg/mod/github.com/onflow/flow-go/crypto@v0.25.0/ -go generate -``` - -Below is a bash script example to automate the above steps. The script can be copied into your Go project root directory. -It extracts the imported pacakage version from your project's go.mod file and performs the remaining steps. -```bash -#!/bin/bash - -# crypto package -PKG_NAME="github.com/onflow/flow-go/crypto" - -# go get the package -go get ${PKG_NAME} - -# go.mod -MOD_FILE="./go.mod" - -# the version of onflow/flow-go/crypto used in the project is read from the go.mod file -if [ -f "${MOD_FILE}" ] -then - # extract the version from the go.mod file - VERSION="$(grep ${PKG_NAME} < ${MOD_FILE} | cut -d' ' -f 2)" - # using the right version, get the package directory path - PKG_DIR="$(go env GOPATH)/pkg/mod/${PKG_NAME}@${VERSION}" -else - { echo "couldn't find go.mod file - make sure the script is in the project root directory"; exit 1; } -fi - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! -x ${PKG_DIR} ]]; then - sudo chmod -R 755 "${PKG_DIR}" -fi - -# get into the package directory and set up the external dependencies -( - cd "${PKG_DIR}" || { echo "cd into the GOPATH package folder failed"; exit 1; } - go generate -) -``` - - -Finally, when building your project and including any BLS functionality, adding a Go build tag to include the BLS files in the build is required. -The tag is not required when the package is used without BLS functions. It was introduced to avoid build errors when BLS (and therefore Relic) is not needed. - -``` -go build -tags=relic -``` - ## Algorithms ### Hashing and Message Authentication Code: @@ -103,11 +39,11 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` * ECDSA * public keys are compressed or uncompressed. - * ephemeral key is derived from the private key, hash and an external entropy using a CSPRNG (based on https://golang.org/pkg/crypto/ecdsa/). + * ephemeral key is derived from the private key, hash and the system entropy (based on https://golang.org/pkg/crypto/ecdsa/). * supports NIST P-256 (secp256r1) and secp256k1 curves. * BLS - * supports [BLS 12-381](https://electriccoin.co/blog/new-snark-curve/) curve. + * supports [BLS12-381](https://electriccoin.co/blog/new-snark-curve/) curve. * is implementing the minimal-signature-size variant: signatures in G1 and public keys in G2. * default set-up uses [compressed](https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) G1/G2 points, @@ -119,16 +55,14 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. * signature verification includes the signature membership check in G1. * public key membership check in G2 is provided outside of the signature verification. - * membership check in G1 is using [Bowe's fast check](https://eprint.iacr.org/2019/814.pdf), while membership check in G2 is using a simple scalar multiplication by the group order (both will be updated to use Scott's method) - * non-interactive aggregation of signatures, public keys and private keys. - * multi-signature verification of an aggregated signature of a single message under multiple public keys. - * multi-signature verification of an aggregated signature of multiple messages under multiple public keys. + * aggregation of signatures, public keys and private keys. 
+ * verification of an aggregated signature of a single message under multiple public keys. + * verification of an aggregated signature of multiple messages under multiple public keys. * batch verification of multiple signatures of a single message under multiple - public keys: use a binary tree of aggregations to find the invalid signatures. + public keys, using a binary tree of aggregations. * SPoCK scheme based on BLS: verifies two signatures have been generated from the same message that is unknown to the verifier. * Future features: - * membership checks in G1/G2 using [Scotts's method](https://eprint.iacr.org/2021/1130.pdf). * support minimal-pubkey-size variant ### PRNG @@ -146,9 +80,6 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` * key generation (single dealer) to provide the set of keys. * provides a stateless api and a stateful api. - * Future features: - * support a partial signature reconstruction in the stateful api to avoid a long final reconstruction. - ### Discrete-Log based distributed key generation @@ -158,7 +89,7 @@ All supported Distributed Key Generation protocols are [discrete log based](http * simple verifiable secret sharing with a single dealer. * the library does not implement the communication channels between participants. The caller should implement the methods `PrivateSend` (1-to-1 messaging) and `Broadcast` (1-to-n messaging) * 1-to-1 messaging must be a private channel, the caller must make sure the channel preserves confidentialiy and authenticates the sender. - * 1-to-n broadcasting assume all destination participants receive the same copy of the message. The channel should also authenticate the broadcaster. + * 1-to-n broadcasting is a reliable broadcast, where honest senders are able to reach all honest receivers, and where all honest receivers end up with the same received messages. The channel should also authenticate the broadcaster. * It is recommended that both communication channels are unique per protocol instance. This could be achieved by prepending the messages to send/broadcast by a unique protocol instance ID. * Feldman VSS Qual. * an extension of the simple Feldman VSS. diff --git a/crypto/bls.go b/crypto/bls.go index f49f4661772..c8650c9dc60 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -1,12 +1,13 @@ package crypto -// BLS signature scheme implementation using BLS12-381 curve -// ([zcash]https://electriccoin.co/blog/new-snark-curve/) -// Pairing, ellipic curve and modular arithmetic is using Relic library. -// This implementation does not include any security against side-channel attacks. - -// existing features: -// - the implementation variant is minimal-signature-size signatures: +// BLS signature scheme implementation using the BLS12-381 curve +// ([zcash]https://electriccoin.co/blog/new-snark-curve/). +// Pairing, ellipic curve and modular arithmetic are using [BLST](https://github.com/supranational/blst/tree/master/src) +// tools underneath. +// This implementation does not include security against side-channel or fault attacks. + +// Existing features: +// - the implementation variant is minimal-signature-size: // shorter signatures in G1, longer public keys in G2 // - serialization of points on G1 and G2 is compressed ([zcash] // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) @@ -18,15 +19,12 @@ package crypto // and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. 
// - signature verification checks the membership of signature in G1. // - the public key membership check in G2 is implemented separately from the signature verification. -// - membership check in G1 is implemented using fast Bowe's check (to be updated to Scott's check). -// - membership check in G2 is using a simple scalar multiplication with the group order (to be updated to Scott's check). // - multi-signature tools are defined in bls_multisg.go -// - SPoCK scheme based on BLS: verifies two signatures have been generated from the same message, -// that is unknown to the verifier. +// - SPoCK scheme based on BLS: verifies two signatures are generated from the same message, +// even though the message is unknown to the verifier. // future features: -// - membership checks G2 using Bowe's method (https://eprint.iacr.org/2019/814.pdf) -// - implement a G1/G2 swap (signatures on G2 and public keys on G1) +// - implement a G1/G2 swap (minimal-pubkey-size variant) // #include "bls_include.h" import "C" diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index e451c8d41f5..5714b7e2a34 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -15,20 +15,21 @@ import ( // BLS multi-signature using BLS12-381 curve // ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381) -// Pairing, ellipic curve and modular arithmetic is using Relic library. -// This implementation does not include any security against side-channel attacks. +// Pairing, elliptic curve and modular arithmetic are using [BLST](https://github.com/supranational/blst/tree/master/src) +// tools underneath. +// This implementation does not include any security against side-channel or fault attacks. -// existing features: +// Existing features: // - the same BLS set-up in bls.go // - Use the proof of possession scheme (PoP) to prevent against rogue public-key attack. -// - Non-interactive aggregation of private keys, public keys and signatures. -// - Non-interactive subtraction of multiple public keys from an (aggregated) public key. +// - Aggregation of private keys, public keys and signatures. +// - Subtraction of multiple public keys from an (aggregated) public key. // - Multi-signature verification of an aggregated signature of a single message // under multiple public keys. // - Multi-signature verification of an aggregated signature of multiple messages under // multiple public keys. // - batch verification of multiple signatures of a single message under multiple -// public keys: use a binary tree of aggregations to find the invalid signatures.
+// newSigner returns a signer instance func newSigner(algo SigningAlgorithm) (signer, error) { switch algo { case ECDSAP256: @@ -63,7 +63,7 @@ func newSigner(algo SigningAlgorithm) (signer, error) { } } -// Initialize the context of all algos not requiring Relic +// Initialize the context of all algos func init() { // P-256 p256Instance = &(ecdsaAlgo{ From 541df79e5aa0f017cf0e2a55cee38a938eacc66f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:23:23 -0600 Subject: [PATCH 109/200] update flow-go/README and gitignore --- .gitignore | 2 -- README.md | 7 ------- 2 files changed, 9 deletions(-) diff --git a/.gitignore b/.gitignore index 472cc944ee4..7d437f2c93e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,6 @@ /cmd/util/util /cmd/bootstrap/bootstrap -# crypto relic folder -crypto/relic/ # Test binary, build with `go test -c` *.test diff --git a/README.md b/README.md index 39bd7a13e3e..291e45de347 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,6 @@ The following table lists all work streams and links to their home directory and - Clone this repository - Install [Go](https://golang.org/doc/install) (Flow supports Go 1.18 and later) -- Install [CMake](https://cmake.org/install/), which is used for building the crypto library - Install [Docker](https://docs.docker.com/get-docker/), which is used for running a local network and integration tests - Make sure the [`GOPATH`](https://golang.org/cmd/go/#hdr-GOPATH_environment_variable) and `GOBIN` environment variables are set, and `GOBIN` is added to your path: @@ -75,12 +74,6 @@ The following table lists all work streams and links to their home directory and At this point, you should be ready to build, test, and run Flow! 🎉 -Note: Whenever the crypto module version imported by "go.mod" is updated to a version that was never locally imported before, the crypto dependency needs to be set-up. If not, you should notice errors about "relic" or "crypto". Run the following command to set-up the new module version: - -```bash -make crypto_setup_gopath -``` - ## Development Workflow ### Testing From 1d0b8f906de5158f6358cd75543ef582953c9e4d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:43:06 -0600 Subject: [PATCH 110/200] remove relic related commands from Makefile/ci/dockerfile --- .github/workflows/bench.yml | 6 +-- .github/workflows/ci.yml | 37 ++++------------ .github/workflows/flaky-test-debug.yml | 14 ------- Makefile | 42 +++++++------------ cmd/bootstrap/README.md | 8 ++-- cmd/bootstrap/cmd/genconfig.go | 2 +- insecure/Makefile | 2 +- integration/Makefile | 24 +++++------ integration/benchmark/cmd/manual/Dockerfile | 16 +------ integration/benchnet2/Makefile | 8 ++-- integration/localnet/Makefile | 8 ++-- module/metrics/example/README.md | 2 +- .../level1/process_summary1_results_test.go | 12 +++--- utils/binstat/binstat_external_test.go | 2 +- 14 files changed, 61 insertions(+), 122 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index ada29474be7..7c3c6d896bd 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -49,14 +49,14 @@ jobs: - name: Run benchmark on current branch run: | - (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . --tags relic -shuffle=on --benchmem --run ^$; done) | tee new.txt + (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . 
-shuffle=on --benchmem --run ^$; done) | tee new.txt - name: Checkout base branch run: git checkout ${{ github.event.pull_request.base.sha }} - name: Run benchmark on base branch run: | - (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . --tags relic -shuffle=on --benchmem --run ^$; done) | tee old.txt + (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . -shuffle=on --benchmem --run ^$; done) | tee old.txt # see https://trstringer.com/github-actions-multiline-strings/ to see why this part is complex - name: Use benchstat for comparison @@ -85,7 +85,7 @@ jobs: This branch with compared with the base branch ${{ github.event.pull_request.base.label }} commit ${{ github.event.pull_request.base.sha }} - The command `(for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . --tags relic -shuffle=on --benchmem --run ^$; done)` was used. + The command `(for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . -shuffle=on --benchmem --run ^$; done)` was used.
Collapsed results for better readability diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08832eab401..57b0da2ace2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,8 +38,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -48,7 +46,7 @@ jobs: with: # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. version: v1.49 - args: -v --build-tags relic + args: -v working-directory: ${{ matrix.dir }} # https://github.com/golangci/golangci-lint-action/issues/244 skip-cache: true @@ -66,20 +64,6 @@ jobs: cache: true - name: Run tidy run: make tidy - - name: Emulator no relic check - run: make emulator-norelic-check - - shell-check: - name: ShellCheck - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v3 - - name: Run ShellCheck - uses: ludeeus/action-shellcheck@203a3fd018dfe73f8ae7e3aa8da2c149a5f41c33 - with: - scandir: './crypto' - ignore: 'relic' create-dynamic-test-matrix: name: Create Dynamic Test Matrix @@ -141,18 +125,15 @@ jobs: matrix: include: - name: crypto - make1: -C crypto setup - make2: unittest + setup: retries: 1 race: 1 - name: insecure - make1: install-tools - make2: test + setup: install-tools retries: 3 race: 1 - name: integration - make1: install-tools - make2: test + setup: install-tools retries: 3 race: 0 runs-on: ubuntu-latest @@ -165,7 +146,7 @@ jobs: go-version: ${{ env.GO_VERSION }} cache: true - name: Setup tests (${{ matrix.name }}) - run: make ${{ matrix.make1 }} + run: make ${{ matrix.setup }} - name: Run tests (${{ matrix.name }}) env: RACE_DETECTOR: ${{ matrix.race }} @@ -173,8 +154,8 @@ jobs: with: timeout_minutes: 25 max_attempts: ${{ matrix.retries }} - # run `make2` target inside each module's root - command: VERBOSE=1 make -C ${{ matrix.name }} ${{ matrix.make2 }} + # run test target inside each module's root + command: VERBOSE=1 make -C ${{ matrix.name }} test - name: Upload coverage report uses: codecov/codecov-action@v3 with: @@ -208,8 +189,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests @@ -235,7 +214,7 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic and other tools + - name: install tools run: make install-tools - name: Install Flow Client In Docker # This proved to be more reliable than installing it locally. 
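For context on what the unconditional crypto test target above now covers: the BLS APIs are compiled into every build, with BLST doing the curve arithmetic underneath. A rough sketch of the sign/verify round trip those tests exercise; the names GeneratePrivateKey, BLSBLS12381 and NewExpandMsgXOFKMAC128 appear in the diffs above, but the exact signatures and seed-length rules shown here are assumptions:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	// BLS key generation derives the key from a caller-provided seed;
	// the package enforces minimum seed length, 64 bytes is assumed to be enough here.
	seed := make([]byte, 64)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}

	sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
	if err != nil {
		panic(err)
	}

	// Hasher matching the package's BLS ciphersuite (expand-message XOF with KMAC128).
	kmac := crypto.NewExpandMsgXOFKMAC128("test tag")

	msg := []byte("message")
	sig, err := sk.Sign(msg, kmac)
	if err != nil {
		panic(err)
	}

	ok, err := sk.PublicKey().Verify(sig, msg, kmac)
	fmt.Println(ok, err)
}
```

On CPUs without ADX, the same code is expected to work when built with CGO_CFLAGS="-O -D__BLST_PORTABLE__", which is the fallback the Makefile targets and the CI matrix above use.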
diff --git a/.github/workflows/flaky-test-debug.yml b/.github/workflows/flaky-test-debug.yml index 3a5b47e2c2f..3e5092c9f07 100644 --- a/.github/workflows/flaky-test-debug.yml +++ b/.github/workflows/flaky-test-debug.yml @@ -55,20 +55,6 @@ jobs: cache: true - name: Run tidy run: make tidy - - name: Emulator no relic check - run: make emulator-norelic-check - - # shell-check: - # name: ShellCheck - # runs-on: ubuntu-latest - # steps: - # - name: Checkout repo - # uses: actions/checkout@v3 - # - name: Run ShellCheck - # uses: ludeeus/action-shellcheck@203a3fd018dfe73f8ae7e3aa8da2c149a5f41c33 - # with: - # scandir: './crypto' - # ignore: 'relic' create-dynamic-test-matrix: name: Create Dynamic Test Matrix diff --git a/Makefile b/Makefile index d0a8fd10c23..6d9b2321bab 100644 --- a/Makefile +++ b/Makefile @@ -42,19 +42,11 @@ K8S_YAMLS_LOCATION_STAGING=./k8s/staging export CONTAINER_REGISTRY := gcr.io/flow-container-registry export DOCKER_BUILDKIT := 1 -# setup the crypto package under the GOPATH: needed to test packages importing flow-go/crypto -# TODO: replace by bash crypto_setup.sh after removing replace statements -.PHONY: crypto_setup_gopath -crypto_setup_gopath: - (cd ./crypto && make setup) - - - cmd/collection/collection: go build -o cmd/collection/collection cmd/collection/main.go cmd/util/util: - go build -o cmd/util/util --tags relic cmd/util/main.go + go build -o cmd/util/util cmd/util/main.go .PHONY: update-core-contracts-version update-core-contracts-version: @@ -65,8 +57,8 @@ update-core-contracts-version: .PHONY: unittest-main unittest-main: - # test all packages with Relic library enabled - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(GO_TEST_PACKAGES) + # test all packages + go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(GO_TEST_PACKAGES) .PHONY: install-mock-generators install-mock-generators: @@ -88,15 +80,10 @@ verify-mocks: generate-mocks ############################################################################################ -.PHONY: emulator-norelic-check -emulator-norelic-check: - # test the fvm package compiles with Relic library disabled (required for the emulator build) - cd ./fvm && go test ./... 
-run=NoTestHasThisPrefix - .PHONY: fuzz-fvm fuzz-fvm: # run fuzz tests in the fvm package - cd ./fvm && go test -fuzz=Fuzz -run ^$$ --tags relic + cd ./fvm && go test -fuzz=Fuzz -run ^$$ .PHONY: test test: verify-mocks unittest-main @@ -154,7 +141,7 @@ generate-mocks: install-mock-generators mockery --name 'ProviderEngine' --dir=engine/execution/provider --case=underscore --output="engine/execution/provider/mock" --outpkg="mock" (cd ./crypto && mockery --name 'PublicKey' --case=underscore --output="../module/mock" --outpkg="mock") mockery --name '.*' --dir=state/cluster --case=underscore --output="state/cluster/mock" --outpkg="mock" - mockery --name '.*' --dir=module --case=underscore --tags="relic" --output="./module/mock" --outpkg="mock" + mockery --name '.*' --dir=module --case=underscore --output="./module/mock" --outpkg="mock" mockery --name '.*' --dir=module/mempool --case=underscore --output="./module/mempool/mock" --outpkg="mempool" mockery --name '.*' --dir=module/component --case=underscore --output="./module/component/mock" --outpkg="component" mockery --name '.*' --dir=network --case=underscore --output="./network/mocknetwork" --outpkg="mocknetwork" @@ -182,7 +169,7 @@ generate-mocks: install-mock-generators mockery --name 'API' --dir="./engine/protocol" --case=underscore --output="./engine/protocol/mock" --outpkg="mock" mockery --name 'API' --dir="./engine/access/state_stream" --case=underscore --output="./engine/access/state_stream/mock" --outpkg="mock" mockery --name 'ConnectionFactory' --dir="./engine/access/rpc/backend" --case=underscore --output="./engine/access/rpc/backend/mock" --outpkg="mock" - mockery --name 'IngestRPC' --dir="./engine/execution/ingestion" --case=underscore --tags relic --output="./engine/execution/ingestion/mock" --outpkg="mock" + mockery --name 'IngestRPC' --dir="./engine/execution/ingestion" --case=underscore --output="./engine/execution/ingestion/mock" --outpkg="mock" mockery --name '.*' --dir=model/fingerprint --case=underscore --output="./model/fingerprint/mock" --outpkg="mock" mockery --name 'ExecForkActor' --structname 'ExecForkActorMock' --dir=module/mempool/consensus/mock/ --case=underscore --output="./module/mempool/consensus/mock/" --outpkg="mock" mockery --name '.*' --dir=engine/verification/fetcher/ --case=underscore --output="./engine/verification/fetcher/mock" --outpkg="mockfetcher" @@ -207,12 +194,12 @@ tidy: .PHONY: lint lint: tidy # revive -config revive.toml -exclude storage/ledger/trie ./... - golangci-lint run -v --build-tags relic ./... + golangci-lint run -v ./... .PHONY: fix-lint fix-lint: # revive -config revive.toml -exclude storage/ledger/trie ./... - golangci-lint run -v --build-tags relic --fix ./... + golangci-lint run -v --fix ./... 
# Runs unit tests with different list of packages as passed by CI so they run in parallel .PHONY: ci @@ -242,7 +229,6 @@ docker-ci: # Runs integration tests in Docker (for mac) .PHONY: docker-ci-integration docker-ci-integration: - rm -rf crypto/relic docker run \ --env DOCKER_API_VERSION='1.39' \ --network host \ @@ -262,7 +248,7 @@ docker-build-collection: .PHONY: docker-build-collection-without-netgo docker-build-collection-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG_NO_NETGO)" . @@ -281,7 +267,7 @@ docker-build-consensus: .PHONY: docker-build-consensus-without-netgo docker-build-consensus-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG_NO_NETGO)" . @@ -300,7 +286,7 @@ docker-build-execution: .PHONY: docker-build-execution-without-netgo docker-build-execution-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG_NO_NETGO)" . @@ -329,7 +315,7 @@ docker-build-verification: .PHONY: docker-build-verification-without-netgo docker-build-verification-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG_NO_NETGO)" . 
@@ -358,7 +344,7 @@ docker-build-access: .PHONY: docker-build-access-without-netgo docker-build-access-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG_NO_NETGO)" . @@ -387,7 +373,7 @@ docker-build-observer: .PHONY: docker-build-observer-without-netgo docker-build-observer-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG_NO_NETGO)" . diff --git a/cmd/bootstrap/README.md b/cmd/bootstrap/README.md index 9000f4d87f4..5fd2964faf5 100644 --- a/cmd/bootstrap/README.md +++ b/cmd/bootstrap/README.md @@ -46,7 +46,7 @@ _Each cluster_ of collector nodes needs to have its own root Block and root QC # Usage -`go run -tags relic ./cmd/bootstrap` prints usage information +`go run ./cmd/bootstrap` prints usage information ## Phase 1: Generate networking and staking keys for partner nodes: @@ -65,7 +65,7 @@ If seeds are not provided, the CLI will try to use the system's pseudo-random nu #### Example ```bash -go run -tags relic ./cmd/bootstrap key --address "example.com:1234" --role "consensus" -o ./bootstrap/partner-node-infos +go run ./cmd/bootstrap key --address "example.com:1234" --role "consensus" -o ./bootstrap/partner-node-infos ``` #### Generated output files @@ -97,7 +97,7 @@ Each input is a config file specified as a command line parameter: #### Example ```bash -go run -tags relic ./cmd/bootstrap finalize \ +go run ./cmd/bootstrap finalize \ --fast-kg \ --root-chain main \ --root-height 0 \ @@ -153,7 +153,7 @@ go run -tags relic ./cmd/bootstrap finalize \ This generates the networking key used by observers to connect to the public libp2p network. It is a different key format than staked nodes and should only be used for Observers. ```bash -go run -tags relic ./cmd/bootstrap observer-network-key -f ./path/network-key +go run ./cmd/bootstrap observer-network-key -f ./path/network-key ``` This key must be kept secret as it's used to encrypt and sign network requests sent by the observers. 
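The bootstrap commands above wrap key generation from this same crypto package; which curve each node role uses is the CLI's decision. A hedged sketch of the ECDSA side only, where the SHA3-256 hasher from the hash sub-package and the seed length are assumptions chosen for illustration:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
	"github.com/onflow/flow-go/crypto/hash"
)

func main() {
	// ECDSA keys are also derived deterministically from a seed.
	seed := make([]byte, 64)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}

	sk, err := crypto.GeneratePrivateKey(crypto.ECDSAP256, seed)
	if err != nil {
		panic(err)
	}

	// Sign and verify a payload with a SHA3-256 hasher from the hash sub-package.
	h := hash.NewSHA3_256()
	msg := []byte("handshake payload")
	sig, err := sk.Sign(msg, h)
	if err != nil {
		panic(err)
	}
	ok, err := sk.PublicKey().Verify(sig, msg, h)
	fmt.Println(ok, err)
}
```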
diff --git a/cmd/bootstrap/cmd/genconfig.go b/cmd/bootstrap/cmd/genconfig.go index 404bd5e873e..ccf66104ecc 100644 --- a/cmd/bootstrap/cmd/genconfig.go +++ b/cmd/bootstrap/cmd/genconfig.go @@ -63,7 +63,7 @@ func genconfigCmdRun(_ *cobra.Command, _ []string) { var genconfigCmd = &cobra.Command{ Use: "genconfig", Short: "Generate node-config.json", - Long: "example: go run -tags relic ./cmd/bootstrap genconfig --address-format \"%s-%03d.devnet19.nodes.onflow.org:3569\" --access 2 --collection 3 --consensus 3 --execution 2 --verification 1 --weight 100", + Long: "example: go run ./cmd/bootstrap genconfig --address-format \"%s-%03d.devnet19.nodes.onflow.org:3569\" --access 2 --collection 3 --consensus 3 --execution 2 --verification 1 --weight 100", Run: genconfigCmdRun, } diff --git a/insecure/Makefile b/insecure/Makefile index 72a38cf4b4d..9872f01b1d8 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -11,4 +11,4 @@ endif # runs all unit tests of the insecure module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic ./... + go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./... diff --git a/integration/Makefile b/integration/Makefile index a4f354c7e4d..7751b4ee333 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -22,53 +22,53 @@ ci-integration-test: access-tests ghost-tests mvp-tests epochs-tests consensus-t # Run unit tests for test utilities in this module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -tags relic -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` + go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` .PHONY: access-tests access-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/access/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/... .PHONY: collection-tests collection-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/collection/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/collection/... .PHONY: consensus-tests consensus-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/consensus/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/consensus/... .PHONY: epochs-tests epochs-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 30m ./tests/epochs/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... .PHONY: ghost-tests ghost-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/ghost/... 
+ go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/ghost/... .PHONY: mvp-tests mvp-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/mvp/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/mvp/... .PHONY: execution-tests execution-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/execution/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/execution/... .PHONY: verification-tests verification-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/verification/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/verification/... .PHONY: upgrades-tests upgrades-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/upgrades/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... .PHONY: network-tests network-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/network/... + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... # BFT tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: bft-tests bft-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/... -p 1 + go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 ############################################################################################ diff --git a/integration/benchmark/cmd/manual/Dockerfile b/integration/benchmark/cmd/manual/Dockerfile index 1ad38985a43..8d474efd3dc 100644 --- a/integration/benchmark/cmd/manual/Dockerfile +++ b/integration/benchmark/cmd/manual/Dockerfile @@ -4,20 +4,11 @@ FROM golang:1.19-buster AS build-setup RUN apt-get update -RUN apt-get -y install cmake zip - -## (1) Build Relic first to maximize caching -FROM build-setup AS build-relic +RUN apt-get -y install zip RUN mkdir /build WORKDIR /build -# Copy over the crypto package -COPY crypto ./crypto - -# Build Relic (this places build artifacts in /build/relic/build) -RUN cd ./crypto/ && go generate - ## (2) Build the app binary FROM build-setup AS build-env @@ -35,9 +26,6 @@ ARG TARGET COPY . . 
-# Copy over Relic build artifacts -COPY --from=build-relic /build/crypto/relic/build ./crypto/relic/build - FROM build-env as build-production WORKDIR /app @@ -48,7 +36,7 @@ RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=ssh \ cd integration && \ - CGO_ENABLED=1 go build --tags relic -ldflags "-extldflags -static" -o ./app ./${TARGET} + CGO_ENABLED=1 go build -ldflags "-extldflags -static" -o ./app ./${TARGET} RUN mv /app/integration/app /app/app diff --git a/integration/benchnet2/Makefile b/integration/benchnet2/Makefile index 62859fbf74c..f1979c0f1b4 100644 --- a/integration/benchnet2/Makefile +++ b/integration/benchnet2/Makefile @@ -29,12 +29,12 @@ endif # for the checked out version will be run in the sub folder but the bootstrap folder will be created here (outside of the checked out flow-go in the sub folder) gen-bootstrap: clone-flow cd flow-go && make crypto_setup_gopath - cd flow-go/cmd/bootstrap && go run -tags relic . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json - cd flow-go/cmd/bootstrap && go run -tags relic . keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys + cd flow-go/cmd/bootstrap && go run . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json + cd flow-go/cmd/bootstrap && go run . keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys echo {} > ./bootstrap/conf/partner-stakes.json mkdir ./bootstrap/partner-nodes - cd flow-go/cmd/bootstrap && go run -tags relic . rootblock --root-chain bench --root-height 0 --root-parent 0000000000000000000000000000000000000000000000000000000000000000 --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --fast-kg --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --internal-priv-dir ../../../bootstrap/keys/private-root-information - cd flow-go/cmd/bootstrap && go run -tags relic . finalize --root-commit 0000000000000000000000000000000000000000000000000000000000000000 --service-account-public-key-json "{\"PublicKey\":\"R7MTEDdLclRLrj2MI1hcp4ucgRTpR15PCHAWLM5nks6Y3H7+PGkfZTP2di2jbITooWO4DD1yqaBSAVK8iQ6i0A==\",\"SignAlgo\":2,\"HashAlgo\":1,\"SeqNumber\":0,\"Weight\":1000}" --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --collection-clusters 1 --epoch-counter 0 --epoch-length 30000 --epoch-staking-phase-length 20000 --epoch-dkg-phase-length 2000 --genesis-token-supply="1000000000.0" --protocol-version=0 --internal-priv-dir ../../../bootstrap/keys/private-root-information --dkg-data ../../../bootstrap/private-root-information/root-dkg-data.priv.json --root-block ../../../bootstrap/public-root-information/root-block.json --root-block-votes-dir ../../../bootstrap/public-root-information/root-block-votes/ --epoch-commit-safety-threshold=1000 + cd flow-go/cmd/bootstrap && go run . 
rootblock --root-chain bench --root-height 0 --root-parent 0000000000000000000000000000000000000000000000000000000000000000 --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --fast-kg --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --internal-priv-dir ../../../bootstrap/keys/private-root-information + cd flow-go/cmd/bootstrap && go run . finalize --root-commit 0000000000000000000000000000000000000000000000000000000000000000 --service-account-public-key-json "{\"PublicKey\":\"R7MTEDdLclRLrj2MI1hcp4ucgRTpR15PCHAWLM5nks6Y3H7+PGkfZTP2di2jbITooWO4DD1yqaBSAVK8iQ6i0A==\",\"SignAlgo\":2,\"HashAlgo\":1,\"SeqNumber\":0,\"Weight\":1000}" --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --collection-clusters 1 --epoch-counter 0 --epoch-length 30000 --epoch-staking-phase-length 20000 --epoch-dkg-phase-length 2000 --genesis-token-supply="1000000000.0" --protocol-version=0 --internal-priv-dir ../../../bootstrap/keys/private-root-information --dkg-data ../../../bootstrap/private-root-information/root-dkg-data.priv.json --root-block ../../../bootstrap/public-root-information/root-block.json --root-block-votes-dir ../../../bootstrap/public-root-information/root-block-votes/ --epoch-commit-safety-threshold=1000 gen-helm-l1: go run automate/cmd/level1/bootstrap.go --data bootstrap/public-root-information/root-protocol-state-snapshot.json --dockerTag $(NETWORK_ID) --dockerRegistry $(DOCKER_REGISTRY) diff --git a/integration/localnet/Makefile b/integration/localnet/Makefile index f35cb0643e0..ac548916ae5 100644 --- a/integration/localnet/Makefile +++ b/integration/localnet/Makefile @@ -43,7 +43,7 @@ ifeq ($(strip $(VALID_EXECUTION)), 1) else ifeq ($(strip $(VALID_CONSENSUS)), 1) $(error Number of Consensus nodes should be no less than 2) else - go run -tags relic \ + go run \ -ldflags="-X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' \ -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ builder/*.go \ @@ -119,15 +119,15 @@ stop: .PHONY: load load: - go run --tags relic ../benchmark/cmd/manual -log-level info -tps 1,10,100 -tps-durations 30s,30s + go run ../benchmark/cmd/manual -log-level info -tps 1,10,100 -tps-durations 30s,30s .PHONY: tps-ci-smoke tps-ci-smoke: - go run --tags relic ../benchmark/cmd/ci -log-level info -tps-initial 1 -tps-min 1 -tps-max 10 -duration 20s -tps-adjust-interval 1s -stat-interval 1s -bigquery-upload=false + go run ../benchmark/cmd/ci -log-level info -tps-initial 1 -tps-min 1 -tps-max 10 -duration 20s -tps-adjust-interval 1s -stat-interval 1s -bigquery-upload=false .PHONY: tps-ci tps-ci: bootstrap-ci build-flow start-flow - go run --tags relic ../benchmark/cmd/ci -log-level info -tps-initial $(TPS_INIT) -tps-min $(TPS_MIN) -tps-max $(TPS_MAX) -duration $(DURATION) + go run ../benchmark/cmd/ci -log-level info -tps-initial $(TPS_INIT) -tps-min $(TPS_MIN) -tps-max $(TPS_MAX) -duration $(DURATION) .PHONY: clean-data clean-data: diff --git a/module/metrics/example/README.md b/module/metrics/example/README.md index f693cac0780..ec319414ad8 100644 --- a/module/metrics/example/README.md +++ b/module/metrics/example/README.md @@ -18,7 +18,7 @@ You can choose one of the following: Note: Running example with `-happypath` flag examines the metrics collection on a real happy path of verification node. 
``` - go run --tags=relic module/metrics/example/verification/main.go + go run module/metrics/example/verification/main.go ``` - Consensus Node: ``` diff --git a/tools/test_monitor/level1/process_summary1_results_test.go b/tools/test_monitor/level1/process_summary1_results_test.go index c64f8442995..6e7b12f0551 100644 --- a/tools/test_monitor/level1/process_summary1_results_test.go +++ b/tools/test_monitor/level1/process_summary1_results_test.go @@ -33,19 +33,19 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { RawJSONTestRunFile: "test-result-crypto-hash-1-count-skip-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "2 count all pass": { ExpectedLevel1Summary: testdata.GetTestData_Level1_2CountPass(), RawJSONTestRunFile: "test-result-crypto-hash-2-count-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "10 count all pass": { ExpectedLevel1Summary: testdata.GetTestData_Level1_10CountPass(), RawJSONTestRunFile: "test-result-crypto-hash-10-count-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "10 count some failures": { ExpectedLevel1Summary: testdata.GetTestData_Level1_10CountSomeFailures(), RawJSONTestRunFile: "test-result-crypto-hash-10-count-fail.json", @@ -54,14 +54,14 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { // no result tests - tests below don't generate pass/fail result due to `go test` bug // with using `fmt.printf("log message")` without newline `\n` - // raw results generated with: go test -v -tags relic -count=1 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack + // raw results generated with: go test -v -count=1 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack // this is a single unit test that produces a no result "1 count single no result test": { ExpectedLevel1Summary: testdata.GetTestData_Level1_1CountSingleExceptionTest(), RawJSONTestRunFile: "test-result-exception-single-1-count-pass.json", }, - //raw results generated with: go test -v -tags relic -count=5 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack + //raw results generated with: go test -v -count=5 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack //multiple no result tests in a row "5 no result tests in a row": { ExpectedLevel1Summary: testdata.GetTestData_Level1_5CountSingleExceptionTest(), @@ -74,7 +74,7 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { RawJSONTestRunFile: "test-result-exception-single-5-count-4-nil-1-normal-pass.json", }, - // raw results generated with: go test -v -tags relic -count=3 -json ./model/encodable/. + // raw results generated with: go test -v -count=3 -json ./model/encodable/. // group of unit tests with a single no result test "3 count no result test with normal tests": { ExpectedLevel1Summary: testdata.GetTestData_Leve1_3CountExceptionWithNormalTests(), diff --git a/utils/binstat/binstat_external_test.go b/utils/binstat/binstat_external_test.go index 9ffa7b23065..10f8b911ff9 100644 --- a/utils/binstat/binstat_external_test.go +++ b/utils/binstat/binstat_external_test.go @@ -28,7 +28,7 @@ import ( * 5. 
Strip "time" field from JSON log line output for shorter read, and * 6. Show the amount of code coverage from the tests. * - * pushd utils/binstat ; go fmt ./*.go ; golangci-lint run && go test -v -vv -coverprofile=coverage.txt -covermode=atomic --tags relic ./... | perl -lane 's~\\n~\n~g; s~"time".*?,~~g; print;' ; go tool cover -func=coverage.txt ; popd + * pushd utils/binstat ; go fmt ./*.go ; golangci-lint run && go test -v -vv -coverprofile=coverage.txt -covermode=atomic ./... | perl -lane 's~\\n~\n~g; s~"time".*?,~~g; print;' ; go tool cover -func=coverage.txt ; popd */ /* From 4eabaf1801d35d771dffb30c8ef8002b39df2c56 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 18:46:26 -0600 Subject: [PATCH 111/200] remove relic tags from go files --- .../verification/combined_verifier_v2.go | 3 -- .../verification/combined_verifier_v3.go | 3 -- .../hotstuff/verification/staking_verifier.go | 3 -- crypto_setup.sh | 32 ----------------- .../computation/computer/spock_norelic.go | 26 -------------- .../computation/computer/spock_relic.go | 3 -- module/dkg_broker.go | 3 -- module/signature/aggregation.go | 3 -- module/signature/aggregation_no_relic.go | 34 ------------------- module/signature/aggregation_test.go | 3 -- 10 files changed, 113 deletions(-) delete mode 100644 crypto_setup.sh delete mode 100644 engine/execution/computation/computer/spock_norelic.go delete mode 100644 module/signature/aggregation_no_relic.go diff --git a/consensus/hotstuff/verification/combined_verifier_v2.go b/consensus/hotstuff/verification/combined_verifier_v2.go index ee67a4ea36a..560cb1f8ece 100644 --- a/consensus/hotstuff/verification/combined_verifier_v2.go +++ b/consensus/hotstuff/verification/combined_verifier_v2.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/consensus/hotstuff/verification/combined_verifier_v3.go b/consensus/hotstuff/verification/combined_verifier_v3.go index 8f5f9acd8f0..39af088ae0d 100644 --- a/consensus/hotstuff/verification/combined_verifier_v3.go +++ b/consensus/hotstuff/verification/combined_verifier_v3.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/consensus/hotstuff/verification/staking_verifier.go b/consensus/hotstuff/verification/staking_verifier.go index 60b2f45f4d5..ecd5013f171 100644 --- a/consensus/hotstuff/verification/staking_verifier.go +++ b/consensus/hotstuff/verification/staking_verifier.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/crypto_setup.sh b/crypto_setup.sh deleted file mode 100644 index e9789c74a23..00000000000 --- a/crypto_setup.sh +++ /dev/null @@ -1,32 +0,0 @@ - -#!/bin/bash - -# crypto package -PKG_NAME="github.com/onflow/flow-go/crypto" - -# go.mod -MOD_FILE="./go.mod" - -# the version of onflow/flow-go/crypto used in the project is read from the go.mod file -if [ -f "${MOD_FILE}" ] -then - # extract the imported version - VERSION="$(go list -f '{{.Version}}' -m ${PKG_NAME})" - # go get the package - go get "${PKG_NAME}@${VERSION}" || { echo "go get the package failed"; exit 1; } - # using the right version, get the package directory path - PKG_DIR="$(go env GOPATH)/pkg/mod/${PKG_NAME}@${VERSION}" -else - { echo "couldn't find go.mod file - make sure the script is in the project root directory"; exit 1; } -fi - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! 
-x ${PKG_DIR} ]]; then - chmod -R 755 "${PKG_DIR}" -fi - -# get into the package directory and set up the external dependencies -( - cd "${PKG_DIR}" || { echo "cd into the GOPATH package folder failed"; exit 1; } - go generate -) diff --git a/engine/execution/computation/computer/spock_norelic.go b/engine/execution/computation/computer/spock_norelic.go deleted file mode 100644 index 81678d94f33..00000000000 --- a/engine/execution/computation/computer/spock_norelic.go +++ /dev/null @@ -1,26 +0,0 @@ -//go:build !relic -// +build !relic - -package computer - -import ( - "github.com/onflow/flow-go/crypto" - "github.com/onflow/flow-go/crypto/hash" -) - -// This is a temporary wrapper that simulates a call to SPoCK prove, -// required for the emulator build. The function is never called by the -// emulator although it is required for a successful build. -// -// TODO(tarak): remove once the crypto module properly implements a non-relic -// version of SPOCKProve. -func SPOCKProve( - sk crypto.PrivateKey, - data []byte, - kmac hash.Hasher, -) ( - crypto.Signature, - error, -) { - panic("SPoCK prove not supported when flow-go is built without relic") -} diff --git a/engine/execution/computation/computer/spock_relic.go b/engine/execution/computation/computer/spock_relic.go index 89a8182ba8f..0fcb835adcd 100644 --- a/engine/execution/computation/computer/spock_relic.go +++ b/engine/execution/computation/computer/spock_relic.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package computer import ( diff --git a/module/dkg_broker.go b/module/dkg_broker.go index 49ebb0ad051..7e64353816e 100644 --- a/module/dkg_broker.go +++ b/module/dkg_broker.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package module import ( diff --git a/module/signature/aggregation.go b/module/signature/aggregation.go index 99129c656dc..76101ee3805 100644 --- a/module/signature/aggregation.go +++ b/module/signature/aggregation.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package signature import ( diff --git a/module/signature/aggregation_no_relic.go b/module/signature/aggregation_no_relic.go deleted file mode 100644 index 6b51c6f35a3..00000000000 --- a/module/signature/aggregation_no_relic.go +++ /dev/null @@ -1,34 +0,0 @@ -//go:build !relic -// +build !relic - -package signature - -import ( - "github.com/onflow/flow-go/crypto" -) - -const panic_relic = "function only supported with the relic build tag" - -// These functions are the non-relic versions of some public functions from the package. -// The functions are here to allow the build of flow-emulator, since the emulator is built -// without the "relic" build tag, and does not run the functions below. 
-type SignatureAggregatorSameMessage struct{} - -func NewSignatureAggregatorSameMessage( - message []byte, - dsTag string, - publicKeys []crypto.PublicKey, -) (*SignatureAggregatorSameMessage, error) { - panic(panic_relic) -} - -func (s *SignatureAggregatorSameMessage) Verify(signer int, sig crypto.Signature) (bool, error) { - panic(panic_relic) -} -func (s *SignatureAggregatorSameMessage) TrustedAdd(signer int, sig crypto.Signature) error { - panic(panic_relic) -} - -func (s *SignatureAggregatorSameMessage) Aggregate() ([]int, crypto.Signature, error) { - panic(panic_relic) -} diff --git a/module/signature/aggregation_test.go b/module/signature/aggregation_test.go index aacd0a89f06..4291a7d5734 100644 --- a/module/signature/aggregation_test.go +++ b/module/signature/aggregation_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package signature import ( From 9b17f933ffeeefeefb27e1de7255d149b186861a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 24 May 2023 19:07:36 -0600 Subject: [PATCH 112/200] remove more relic related tags and code --- .github/workflows/bench.yml | 3 --- .github/workflows/cd.yml | 2 -- .github/workflows/flaky-test-debug.yml | 5 ----- .github/workflows/tools.yml | 2 -- cmd/Dockerfile | 4 ++-- .../computation/computer/result_collector.go | 2 +- .../computation/computer/spock_relic.go | 21 ------------------- 7 files changed, 3 insertions(+), 36 deletions(-) delete mode 100644 engine/execution/computation/computer/spock_relic.go diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7c3c6d896bd..b57bbbd440d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -44,9 +44,6 @@ jobs: go-version: "1.19" cache: true - - name: Build relic - run: make crypto_setup_gopath - - name: Run benchmark on current branch run: | (for i in {1..${{ steps.settings.outputs.benchmark_repetitions }}}; do go test ./fvm ./engine/execution/computation --bench . -shuffle=on --benchmem --run ^$; done) | tee new.txt diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index eb28e840078..962242cb888 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -17,8 +17,6 @@ jobs: go-version: '1.19' - name: Checkout repo uses: actions/checkout@v2 - - name: Build relic - run: make crypto_setup_gopath # Provide Google Service Account credentials to Github Action, allowing interaction with the Google Container Registry # Logging in as github-actions@dl-flow.iam.gserviceaccount.com - name: Docker login diff --git a/.github/workflows/flaky-test-debug.yml b/.github/workflows/flaky-test-debug.yml index 3e5092c9f07..722b9ed2f4e 100644 --- a/.github/workflows/flaky-test-debug.yml +++ b/.github/workflows/flaky-test-debug.yml @@ -27,8 +27,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -37,7 +35,6 @@ jobs: with: # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. 
version: v1.49 - args: -v --build-tags relic working-directory: ${{ matrix.dir }} # https://github.com/golangci/golangci-lint-action/issues/244 skip-cache: true @@ -192,8 +189,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests diff --git a/.github/workflows/tools.yml b/.github/workflows/tools.yml index 2e297adb6ff..2cd9ee447a8 100644 --- a/.github/workflows/tools.yml +++ b/.github/workflows/tools.yml @@ -34,8 +34,6 @@ jobs: uses: actions/checkout@v2 with: ref: ${{ inputs.tag }} - - name: Build relic - run: make crypto_setup_gopath - name: Build and upload boot-tools run: | make tool-bootstrap tool-transit diff --git a/cmd/Dockerfile b/cmd/Dockerfile index fc4bcf7badb..4e38b48432f 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -36,7 +36,7 @@ WORKDIR /app ARG GOARCH=amd64 # TAGS can be overriden to modify the go build tags (e.g. build without netgo) -ARG TAGS="relic,netgo" +ARG TAGS="netgo" # Keep Go's build cache between builds. # https://github.com/golang/go/issues/27719#issuecomment-514747274 @@ -64,7 +64,7 @@ ARG GOARCH=amd64 RUN --mount=type=ssh \ --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=1 GOOS=linux go build --tags "relic,netgo" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux go build --tags "netgo" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -gcflags="all=-N -l" -o ./app ${TARGET} diff --git a/engine/execution/computation/computer/result_collector.go b/engine/execution/computation/computer/result_collector.go index dd6a6f90ade..4102d19efb3 100644 --- a/engine/execution/computation/computer/result_collector.go +++ b/engine/execution/computation/computer/result_collector.go @@ -171,7 +171,7 @@ func (collector *resultCollector) commitCollection( spock, err := collector.signer.SignFunc( collectionExecutionSnapshot.SpockSecret, collector.spockHasher, - SPOCKProve) + crypto.SPOCKProve) if err != nil { return fmt.Errorf("signing spock hash failed: %w", err) } diff --git a/engine/execution/computation/computer/spock_relic.go b/engine/execution/computation/computer/spock_relic.go deleted file mode 100644 index 0fcb835adcd..00000000000 --- a/engine/execution/computation/computer/spock_relic.go +++ /dev/null @@ -1,21 +0,0 @@ -package computer - -import ( - "github.com/onflow/flow-go/crypto" - "github.com/onflow/flow-go/crypto/hash" -) - -// This is a temporary wrapper that around the crypto library. -// -// TODO(tarak): remove once the crypto module properly implements a non-relic -// version of SPOCKProve. 
-func SPOCKProve( - sk crypto.PrivateKey, - data []byte, - kmac hash.Hasher, -) ( - crypto.Signature, - error, -) { - return crypto.SPOCKProve(sk, data, kmac) -} From 57215468aab0d425bef4a5f84b92de9a977563de Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 25 May 2023 00:31:16 -0600 Subject: [PATCH 113/200] remove crypto_setup_gopath --- Makefile | 4 ++-- cmd/Dockerfile | 3 +-- integration/benchnet2/Makefile | 1 - tools/test_monitor/run-tests.sh | 5 +---- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 6d9b2321bab..cd402f40f1e 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ install-mock-generators: go install github.com/golang/mock/mockgen@v1.6.0; .PHONY: install-tools -install-tools: crypto_setup_gopath check-go-version install-mock-generators +install-tools: check-go-version install-mock-generators cd ${GOPATH}; \ go install github.com/golang/protobuf/protoc-gen-go@v1.3.2; \ go install github.com/uber/prototool/cmd/prototool@v1.9.0; \ @@ -207,7 +207,7 @@ ci: install-tools test # Runs integration tests .PHONY: ci-integration -ci-integration: crypto_setup_gopath +ci-integration: $(MAKE) -C integration ci-integration-test # Runs benchmark tests diff --git a/cmd/Dockerfile b/cmd/Dockerfile index 4e38b48432f..5dbde25bfb4 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -25,8 +25,7 @@ COPY . . RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - --mount=type=secret,id=git_creds,dst=/root/.netrc \ - make crypto_setup_gopath + --mount=type=secret,id=git_creds,dst=/root/.netrc #################################### ## (3) Build the production app binary diff --git a/integration/benchnet2/Makefile b/integration/benchnet2/Makefile index f1979c0f1b4..73364e104c2 100644 --- a/integration/benchnet2/Makefile +++ b/integration/benchnet2/Makefile @@ -28,7 +28,6 @@ endif # assumes there is a checked out version of flow-go in a "flow-go" sub-folder at this level so that the bootstrap executable # for the checked out version will be run in the sub folder but the bootstrap folder will be created here (outside of the checked out flow-go in the sub folder) gen-bootstrap: clone-flow - cd flow-go && make crypto_setup_gopath cd flow-go/cmd/bootstrap && go run . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json cd flow-go/cmd/bootstrap && go run . 
keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys echo {} > ./bootstrap/conf/partner-stakes.json diff --git a/tools/test_monitor/run-tests.sh b/tools/test_monitor/run-tests.sh index 0cbf1383b19..c30085ffc21 100755 --- a/tools/test_monitor/run-tests.sh +++ b/tools/test_monitor/run-tests.sh @@ -23,7 +23,6 @@ then fi echo "preparing $TEST_CATEGORY tests">&2 - make crypto_setup_gopath make docker-build-flow docker-build-flow-corrupt echo "running $TEST_CATEGORY tests">&2 make -C integration -s ${BASH_REMATCH[1]}-tests > test-output @@ -37,10 +36,8 @@ else make -s unittest-main > test-output ;; unit-crypto) - echo "preparing crypto unit tests">&2 - make -C crypto setup echo "running crypto unit tests">&2 - make -C crypto -s unittest > test-output + make -C crypto -s test > test-output ;; unit-insecure) echo "preparing insecure unit tests">&2 From 06572e347c798177e8a46b7289bf412b568c9353 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 25 May 2023 00:48:18 -0600 Subject: [PATCH 114/200] update go generate --- crypto/common.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/crypto/common.go b/crypto/common.go index 7e460cbf6d2..b9e072c9930 100644 --- a/crypto/common.go +++ b/crypto/common.go @@ -8,9 +8,6 @@ import ( //revive:disable:var-naming -// the `go generate` command requires bash scripting, `cmake` and `git`. -//go:generate bash ./build_dependency.sh - const ( // Minimum targeted bits of security. // This is used as a reference but it doesn't mean all implemented primitives provide this minimum. From 33c5e0e266a64d8b6d5a90624e469a3238d553e9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 26 May 2023 13:55:43 -0600 Subject: [PATCH 115/200] remove cmake install from dockerfile --- cmd/Dockerfile | 2 +- crypto/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/Dockerfile b/cmd/Dockerfile index 5dbde25bfb4..90075485922 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -6,7 +6,7 @@ FROM golang:1.19-bullseye AS build-setup RUN apt-get update -RUN apt-get -y install cmake zip +RUN apt-get -y install zip ## (2) Setup crypto dependencies FROM build-setup AS build-env diff --git a/crypto/Dockerfile b/crypto/Dockerfile index 37a0b373171..7566ea751b3 100644 --- a/crypto/Dockerfile +++ b/crypto/Dockerfile @@ -2,7 +2,7 @@ FROM golang:1.19-buster RUN apt-get update -RUN apt-get -y install cmake zip +RUN apt-get -y install zip RUN go install github.com/axw/gocov/gocov@latest RUN go install github.com/matm/gocov-html@latest WORKDIR /go/src/flow From 83f42fb5d90758fade9ea721b0719d9c4f6aa581 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 26 May 2023 18:08:58 -0600 Subject: [PATCH 116/200] clean up header files in blst_include.h --- crypto/blst_include.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crypto/blst_include.h b/crypto/blst_include.h index e408c9c0c70..89966463c61 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -4,12 +4,10 @@ // extra tools to use BLST low level that are needed by the Flow crypto library // eventually this file would replace blst.h -#include "bls12381_utils.h" #include "point.h" #include "fields.h" #include "consts.h" #include "errors.h" -#include "sha256.h" // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types @@ -66,8 +64,8 @@ typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ // are represented 
as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r elements). // `Fr` is defined as a struct to be exportable through cgo to the Go layer. -#define R_BITS 255 -typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // TODO: use Fr_LIMBS +#define R_BITS 255 // equal to Fr_bits in bls12381_utils.h +typedef struct {limb_t limbs[(R_BITS+63)/64];} Fr; // field elements F_p // F_p elements are represented as big numbers reduced modulo `p`. Big numbers From ec2ceb400891ae67bbc9deb38835901bda359513 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 26 May 2023 19:18:38 -0600 Subject: [PATCH 117/200] update boolean usage from bool_t to C type bool --- crypto/bls.go | 2 +- crypto/bls12381_utils.c | 28 ++++++++-------- crypto/bls12381_utils.go | 16 ++++----- crypto/bls12381_utils.h | 27 ++++++++------- crypto/bls_thresholdsign_core.c | 6 ++-- crypto/bls_thresholdsign_include.h | 2 +- crypto/blst_include.h | 54 +++--------------------------- crypto/dkg_core.c | 2 +- crypto/dkg_feldmanvss.go | 4 +-- crypto/dkg_feldmanvssq.go | 5 +-- crypto/dkg_include.h | 2 +- 11 files changed, 54 insertions(+), 94 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c8650c9dc60..3206a29cdf9 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -347,7 +347,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err } // membership check in G2 - if C.E2_in_G2((*C.E2)(&pk.point)) == (C.ulonglong)(0) { + if !bool(C.E2_in_G2((*C.E2)(&pk.point))) { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index a6b1e5c5e44..07224cd4242 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -38,12 +38,12 @@ const Fr BLS12_381_rR = {{ \ }}; // returns true if a == 0 and false otherwise -bool_t Fr_is_zero(const Fr* a) { +bool Fr_is_zero(const Fr* a) { return bytes_are_zero((const byte*)a, sizeof(Fr)); } // returns true if a == b and false otherwise -bool_t Fr_is_equal(const Fr* a, const Fr* b) { +bool Fr_is_equal(const Fr* a, const Fr* b) { return vec_is_equal(a, b, sizeof(Fr)); } @@ -265,7 +265,7 @@ static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool_t map_bytes_to_Fr(Fr* a, const byte* bin, int len) { +bool map_bytes_to_Fr(Fr* a, const byte* bin, int len) { Fr_from_be_bytes(a, bin, len); return Fr_is_zero(a); } @@ -311,7 +311,7 @@ static void Fp_neg(Fp *res, const Fp *a) { // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. -static bool_t Fp_sqrt_montg(Fp *res, const Fp* a) { +static bool Fp_sqrt_montg(Fp *res, const Fp* a) { return sqrt_fp((limb_t*)res, (limb_t*)a); } @@ -415,7 +415,7 @@ static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. 
-static bool_t Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { +static bool Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { return sqrt_fp2((vec384*)res, (vec384*)a); } @@ -466,13 +466,13 @@ void E1_copy(E1* res, const E1* p) { } // checks p1 == p2 -bool_t E1_is_equal(const E1* p1, const E1* p2) { +bool E1_is_equal(const E1* p1, const E1* p2) { // `POINTonE1_is_equal` includes the infinity case return POINTonE1_is_equal((const POINTonE1*)p1, (const POINTonE1*)p2); } // compare p to infinity -bool_t E1_is_infty(const E1* p) { +bool E1_is_infty(const E1* p) { // BLST infinity points are defined by Z=0 return vec_is_zero(p->z, sizeof(p->z)); } @@ -495,14 +495,14 @@ void E1_to_affine(E1* res, const E1* p) { } // checks affine point `p` is in E1 -bool_t E1_affine_on_curve(const E1* p) { +bool E1_affine_on_curve(const E1* p) { // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); } // checks if input E1 point is on the subgroup G1. // It assumes input `p` is on E1. -bool_t E1_in_G1(const E1* p){ +bool E1_in_G1(const E1* p){ // currently uses Scott method return POINTonE1_in_G1((const POINTonE1*)p); } @@ -859,19 +859,19 @@ void E2_set_infty(E2* p) { } // check if `p` is infinity -bool_t E2_is_infty(const E2* p) { +bool E2_is_infty(const E2* p) { // BLST infinity points are defined by Z=0 return vec_is_zero(p->z, sizeof(p->z)); } // checks affine point `p` is in E2 -bool_t E2_affine_on_curve(const E2* p) { +bool E2_affine_on_curve(const E2* p) { // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); } // checks p1 == p2 -bool_t E2_is_equal(const E2* p1, const E2* p2) { +bool E2_is_equal(const E2* p1, const E2* p2) { // `POINTonE2_is_equal` includes the infinity case return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); } @@ -935,7 +935,7 @@ void G2_mult_gen(E2* res, const Fr* expo) { // checks if input E2 point is on the subgroup G2. // It assumes input `p` is on E2. -bool_t E2_in_G2(const E2* p){ +bool E2_in_G2(const E2* p){ // currently uses Scott method return POINTonE2_in_G2((const POINTonE2*)p); } @@ -1084,7 +1084,7 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { // ------------------- Pairing utilities -bool_t Fp12_is_one(Fp12 *a) { +bool Fp12_is_one(Fp12 *a) { return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 9695d45aba2..94083cf9abe 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -89,28 +89,28 @@ func generatorScalarMultG2(res *pointE2, expo *scalar) { // comparison in Fr where r is the group order of G1/G2 // (both scalars should be reduced mod r) func (x *scalar) equals(other *scalar) bool { - return C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other)) != 0 + return bool(C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other))) } // comparison in E1 func (p *pointE1) equals(other *pointE1) bool { - return C.E1_is_equal((*C.E1)(p), (*C.E1)(other)) != 0 + return bool(C.E1_is_equal((*C.E1)(p), (*C.E1)(other))) } // comparison in E2 func (p *pointE2) equals(other *pointE2) bool { - return C.E2_is_equal((*C.E2)(p), (*C.E2)(other)) != 0 + return bool(C.E2_is_equal((*C.E2)(p), (*C.E2)(other))) } // Comparison to zero in Fr. 
// Scalar must be already reduced modulo r func (x *scalar) isZero() bool { - return C.Fr_is_zero((*C.Fr)(x)) != 0 + return bool(C.Fr_is_zero((*C.Fr)(x))) } // Comparison to point at infinity in G2. func (p *pointE2) isInfinity() bool { - return C.E2_is_infty((*C.E2)(p)) != 0 + return bool(C.E2_is_infty((*C.E2)(p))) } // generates a random element in F_r using input random source, @@ -142,7 +142,7 @@ func mapToFr(x *scalar, src []byte) bool { isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - return isZero != (C.ulonglong)(0) + return bool(isZero) } // writeScalar writes a scalar in a slice of bytes @@ -231,13 +231,13 @@ func readPointE1(a *pointE1, src []byte) error { // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. func checkMembershipG1(pt *pointE1) bool { - return C.E1_in_G1((*C.E1)(pt)) != (C.ulonglong)(0) + return bool(C.E1_in_G1((*C.E1)(pt))) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. func checkMembershipG2(pt *pointE2) bool { - return C.E2_in_G2((*C.E2)(pt)) != (C.ulonglong)(0) + return bool(C.E2_in_G2((*C.E2)(pt))) } // This is only a TEST/DEBUG/BENCH function. diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index d2f2d8b489f..3e8ca1b06ea 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -8,6 +8,9 @@ #include #include "blst_include.h" +typedef uint8_t byte; +typedef _Bool bool; // assuming cgo is using a modern enough compiler + #define SEC_BITS 128 #define VALID 0 #define INVALID 1 @@ -50,8 +53,8 @@ int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); // Fr utilities extern const Fr BLS12_381_rR; -bool_t Fr_is_zero(const Fr* a); -bool_t Fr_is_equal(const Fr* a, const Fr* b); +bool Fr_is_zero(const Fr* a); +bool Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); void Fr_copy(Fr*, const Fr*); void Fr_set_zero(Fr*); @@ -69,7 +72,7 @@ void Fr_inv_exp_montg(Fr *res, const Fr *a); BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); void Fr_write_bytes(byte *bin, const Fr* a); -bool_t map_bytes_to_Fr(Fr*, const byte*, int); +bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities void Fp_mul_montg(Fp *, const Fp *, const Fp *); @@ -77,12 +80,12 @@ void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); -bool_t E1_is_equal(const E1*, const E1*); +bool E1_is_equal(const E1*, const E1*); void E1_set_infty(E1*); -bool_t E1_is_infty(const E1*); +bool E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); -bool_t E1_affine_on_curve(const E1*); -bool_t E1_in_G1(const E1*); +bool E1_affine_on_curve(const E1*); +bool E1_in_G1(const E1*); void E1_mult(E1*, const E1*, const Fr*); void E1_add(E1*, const E1*, const E1*); void E1_neg(E1*, const E1*); @@ -99,9 +102,9 @@ int map_to_G1(E1*, const byte*, const int); // E2 and G2 utilities void E2_set_infty(E2* p); -bool_t E2_is_infty(const E2*); -bool_t E2_affine_on_curve(const E2*); -bool_t E2_is_equal(const E2*, const E2*); +bool E2_is_infty(const E2*); +bool E2_affine_on_curve(const E2*); +bool E2_is_equal(const E2*, const E2*); void E2_copy(E2*, const E2*); void E2_to_affine(E2*, const E2*); BLST_ERROR E2_read_bytes(E2*, const byte *, const int); @@ -113,12 +116,12 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_neg(E2*, const E2*); void E2_sum_vector(E2*, const E2*, const int); void 
E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -bool_t E2_in_G2(const E2*); +bool E2_in_G2(const E2*); void unsafe_map_bytes_to_G2(E2*, const byte*, int); BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 -bool_t Fp12_is_one(Fp12*); +bool Fp12_is_one(Fp12*); void Fp12_set_one(Fp12*); void multi_pairing(Fp12*, const E1*, const E2*, const int); diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index e160a16e7c9..78a87823b4c 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -7,7 +7,7 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] // and stores it in `res`, where t is the degree of the polynomial P. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indices[], const int len){ +static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const byte indices[], const int len){ // coefficient is computed as N * D^(-1) Fr numerator; // eventually would represent N*R^k @@ -63,7 +63,7 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const uint8_t indice // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the indices [indices(0)..indices(t)] // and their G1 images [shares(0)..shares(t)], and stores the resulting G1 point in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. -static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const uint8_t indices[], const int len) { +static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const byte indices[], const int len) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1 // where A_i = g1 ^ a_i @@ -83,7 +83,7 @@ static void E1_lagrange_interpolate_at_zero(E1* out, const E1 shares[], const ui // Computes the Langrange interpolation at zero LI(0) with regards to the indices [indices(0)..indices(t)] // and writes their E1 concatenated serializations [shares(1)..shares(t+1)] in `dest`. // `len` is equal to `t+1` where `t` is the polynomial degree. 
-int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const uint8_t indices[], const int len) { +int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const byte indices[], const int len) { int read_ret; E1* E1_shares = malloc(sizeof(E1) * len); for (int i=0; i < len; i++) { diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 1275b10bab4..3937f8ce965 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -3,7 +3,7 @@ #include "bls_include.h" -int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const uint8_t[], const int); +int E1_lagrange_interpolate_at_zero_write(byte*, const byte* , const byte[], const int); extern void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int a_size, const byte x); #endif diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 89966463c61..20c2fcad5df 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -1,63 +1,19 @@ #ifndef __BLST_INCLUDE_H__ #define __BLST_INCLUDE_H__ -// extra tools to use BLST low level that are needed by the Flow crypto library -// eventually this file would replace blst.h - +// BLST src headers #include "point.h" #include "fields.h" #include "consts.h" -#include "errors.h" - -// types used by the Flow crypto library that are imported from BLST -// these type definitions are used as an abstraction from BLST internal types - -// Parts of this file have been copied from blst.h in the BLST repo -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifdef __SIZE_TYPE__ -typedef __SIZE_TYPE__ size_t; -#else -#include -#endif - -#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ - && defined(__UINT64_TYPE__) -typedef __UINT8_TYPE__ uint8_t; -typedef __UINT32_TYPE__ uint32_t; -typedef __UINT64_TYPE__ uint64_t; -#else -#include -#endif - -typedef uint8_t byte; - -#ifdef __cplusplus -extern "C" { -#elif defined(__BLST_CGO__) -typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ -#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 -# define bool _Bool -#else -# define bool int -#endif - -#ifdef SWIG -# define DEFNULL =NULL -#elif defined __cplusplus -# define DEFNULL =0 -#else -# define DEFNULL -#endif +#include "errors.h" // TODO: add sanity checks that BLST_PK_IS_INFINITY is indeed the last // enum value (eventually submit a fix to BLST) #define BLST_BAD_SCALAR ((BLST_PK_IS_INFINITY)+1) +// types used by the Flow crypto library that are imported from BLST +// these type definitions are used as an abstraction from BLST internal types + // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 2b34572089c..9966fbcfc37 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -78,7 +78,7 @@ BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ // checks the discrete log relationship in G2. // - returns 1 if g2^x = y, where g2 is the generator of G2 // - returns 0 otherwise. 
-bool_t G2_check_log(const Fr* x, const E2* y) { +bool G2_check_log(const Fr* x, const E2* y) { E2 tmp; G2_mult_gen(&tmp, x); return E2_is_equal(&tmp, y); diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index dd81bbcd79c..98420cc87cf 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -473,9 +473,9 @@ func readVerifVector(A []pointE2, src []byte) error { func (s *feldmanVSSstate) verifyShare() bool { // check y[current] == x.G2 - return C.G2_check_log( + return bool(C.G2_check_log( (*C.Fr)(&s.x), - (*C.E2)(&s.y[s.myIndex])) != 0 + (*C.E2)(&s.y[s.myIndex]))) } // computePublicKeys extracts the participants public keys from the verification vector diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 620c962faaa..a7de2fe93d9 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -511,9 +511,10 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) // - true if the complaint answer is not correct func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { // check y[complainer] == share.G2 - return C.G2_check_log( + isLog := C.G2_check_log( (*C.Fr)(&c.answer), - (*C.E2)(&s.y[complainer])) == 0 + (*C.E2)(&s.y[complainer])) + return !bool(isLog) } // data = |complainee| diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index ca6619eb10f..8d3bdc7e1d7 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -8,6 +8,6 @@ void Fr_polynomial_image(Fr* out, E2* y, const Fr* a, const int deg, cons void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int deg); void G2_vector_write_bytes(byte* out, const E2* A, const int len); BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len); -bool_t G2_check_log(const Fr* x, const E2* y); +bool G2_check_log(const Fr* x, const E2* y); #endif From fb4ac123b967fdbe4d3f1b676d6f299367a81ecf Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sat, 27 May 2023 22:11:45 -0600 Subject: [PATCH 118/200] add sanity check scalar mult in G1 and G2 --- crypto/bls12381_utils_test.go | 36 +++++++++++++++++++++++++++++++++++ crypto/ecdsa_test.go | 2 +- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 69d7e687f9b..17a1526414a 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -9,6 +9,42 @@ import ( "github.com/stretchr/testify/require" ) +// Sanity check of G1 and G2 scalar multiplication +func TestScalarMultBLS12381(t *testing.T) { + expoBytes, err := hex.DecodeString("444465cb6cc2dba9474e6beeb6a9013fbf1260d073429fb14a31e63e89129390") + require.NoError(t, err) + + var expo scalar + isZero := mapToFr(&expo, expoBytes) + require.False(t, isZero) + + // G1 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + t.Run("G1", func(t *testing.T) { + var p pointE1 + generatorScalarMultG1(&p, &expo) + expected, err := hex.DecodeString("96484ca50719f5d2533047960878b6bae8289646c0f00a942a1e6992be9981a9e0c7a51e9918f9b19d178cf04a8018a4") + require.NoError(t, err) + pBytes := make([]byte, SignatureLenBLSBLS12381) + writePointE1(pBytes, &p) + assert.Equal(t, pBytes, expected) + }) + + // G2 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + t.Run("G2", func(t *testing.T) { + var p pointE2 + generatorScalarMultG2(&p, &expo) + expected, err := 
hex.DecodeString("b35f5043f166848805b98da62dcb9c5d2f25e497bd0d9c461d4a00d19e4e67cc1e813de3c99479d5a2c62fb754fd7df40c4fd60c46834c8ae665343a3ff7dc3cc929de34ad62b7b55974f4e3fd20990d3e564b96e4d33de87716052d58cf823e") + require.NoError(t, err) + pBytes := make([]byte, PubKeyLenBLSBLS12381) + writePointE2(pBytes, &p) + assert.Equal(t, pBytes, expected) + }) +} + // G1 and G2 scalar multiplication func BenchmarkScalarMult(b *testing.B) { seed := make([]byte, securityBits/8) diff --git a/crypto/ecdsa_test.go b/crypto/ecdsa_test.go index d5d38f8e947..6a69453816d 100644 --- a/crypto/ecdsa_test.go +++ b/crypto/ecdsa_test.go @@ -157,7 +157,7 @@ func TestECDSAUtils(t *testing.T) { // TestScalarMult is a unit test of the scalar multiplication // This is only a sanity check meant to make sure the curve implemented // is checked against an independant test vector -func TestScalarMult(t *testing.T) { +func TestScalarMultP256_secp256k1(t *testing.T) { secp256k1 := secp256k1Instance.curve p256 := p256Instance.curve genericMultTests := []struct { From 0232a953b2040dda260de7aa4c761f8404efb1e5 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 28 May 2023 00:27:37 -0600 Subject: [PATCH 119/200] use not enough shares error in BLSReconstructThresholdSignature --- crypto/bls_thresholdsign.go | 14 +++++++++----- crypto/bls_thresholdsign_test.go | 9 +++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 3cef4d4e605..a7eaad5a2a4 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -437,10 +437,14 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat // // size is the number of participants, it must be in the range [ThresholdSignMinSize..ThresholdSignMaxSize]. // threshold is the threshold value, it must be in the range [MinimumThreshold..size-1]. -// The function does not check the validity of the shares, and does not check -// the validity of the resulting signature. +// The function does not accept any input public key. Therefore, it does not check the validity of the +// shares against individual public keys, and does not check the validity of the resulting signature +// against the group public key. // BLSReconstructThresholdSignature returns: -// - (nil, error) if the inputs are not in the correct range, if the threshold is not reached +// - (nil, invalidInputsError) if : +// -- numbers of shares does not match the number of signers +// -- the inputs are not in the correct range. +// - (nil, notEnoughSharesError) if the threshold is not reached. // - (nil, duplicatedSignerError) if input signers are not distinct. // - (nil, invalidSignatureError) if at least one of the first (threshold+1) signatures. // does not serialize to a valid E1 point. 
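For illustration, a minimal caller-side sketch of the error handling described in the doc comment above. It relies only on the API shown in this patch (`BLSReconstructThresholdSignature` and the `IsNotEnoughSharesError` / `IsInvalidSignatureError` helpers used in the test changes below); the package name, function name, and parameter values are placeholders, not part of the patch.

```go
package example

import (
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

// reconstructGroupSig combines at least threshold+1 signature shares into a group signature.
// n is the number of participants, threshold the threshold value, shares the collected
// signature shares, and signers their distinct indices (one index per share).
func reconstructGroupSig(n, threshold int, shares []crypto.Signature, signers []int) (crypto.Signature, error) {
	sig, err := crypto.BLSReconstructThresholdSignature(n, threshold, shares, signers)
	switch {
	case err == nil:
		// The reconstruction does not take public keys as input, so the caller should
		// still verify the result against the group public key.
		return sig, nil
	case crypto.IsNotEnoughSharesError(err):
		return nil, fmt.Errorf("need at least %d shares: %w", threshold+1, err)
	case crypto.IsInvalidSignatureError(err):
		return nil, fmt.Errorf("a share does not serialize to a valid E1 point: %w", err)
	default:
		// remaining cases: inputs out of range, mismatched slice lengths, or duplicated signers
		return nil, err
	}
}
```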
@@ -470,8 +474,8 @@ func BLSReconstructThresholdSignature(size int, threshold int, } if len(shares) < threshold+1 { - return nil, invalidInputsErrorf( - "the number of signatures does not reach the threshold") + return nil, notEnoughSharesErrorf( + "the number of signatures %d is less than the minimum %d", len(shares), threshold+1) } // map to check signers are distinct diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 3e55f3d1806..20d578db264 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -594,9 +594,14 @@ func testCentralizedStatelessAPI(t *testing.T) { signers[randomDuplicate] = tmp } + // check with not enough signatures + thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares[:threshold], signers[:threshold]) + assert.Error(t, err) + assert.True(t, IsNotEnoughSharesError(err)) + assert.Nil(t, thresholdSignature) + // check with an invalid signature (invalid serialization) - invalidSig := make([]byte, signatureLengthBLSBLS12381) - signShares[0] = invalidSig + signShares[0] = BLSInvalidSignature() thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) assert.Error(t, err) assert.True(t, IsInvalidSignatureError(err)) From 5fa28df293e2490da9b77d9df86edb492ced274a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Sun, 28 May 2023 01:47:16 -0600 Subject: [PATCH 120/200] refactor BLS constants to use internal BLS12_381 length constants --- crypto/bls.go | 35 ++++++++++++++--------------------- crypto/bls12381_utils.c | 12 ++++++++++++ crypto/bls12381_utils.go | 14 +++++++++----- crypto/bls12381_utils.h | 3 +++ crypto/bls12381_utils_test.go | 10 +++++----- crypto/bls_crossBLST_test.go | 2 +- crypto/dkg_feldmanvss.go | 8 ++++---- crypto/dkg_feldmanvssq.go | 4 ++-- 8 files changed, 50 insertions(+), 38 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 3206a29cdf9..65c7ce4d390 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -39,24 +39,16 @@ import ( "github.com/onflow/flow-go/crypto/hash" ) -const ( - // BLS12-381 - // p size in bytes, where G1 is defined over the field Zp - fieldSize = 48 - // - // 1 for compressed, 0 for uncompressed - values should not be changed - uncompressed = 0 //nolint - compressed = 1 - // Points compression when serialized - serializationG1 = compressed - serializationG2 = compressed - // - // SignatureLenBLSBLS12381 is the size of G1 elements - SignatureLenBLSBLS12381 = fieldSize * (2 - serializationG1) // the length is divided by 2 if compression is on - PrKeyLenBLSBLS12381 = 32 // equal to frBytesLen - // PubKeyLenBLSBLS12381 is the size of G2 elements - PubKeyLenBLSBLS12381 = 2 * fieldSize * (2 - serializationG2) // the length is divided by 2 if compression is on +var ( + // SignatureLenBLSBLS12381 is the size of a `G_1` element. + SignatureLenBLSBLS12381 = g1BytesLen + // PubKeyLenBLSBLS12381 is the size of a `G_2` element. + PubKeyLenBLSBLS12381 = g2BytesLen + // PrKeyLenBLSBLS12381 is the size of a `F_r` element, where `r` is the order of `G_1` and `G_2`. + PrKeyLenBLSBLS12381 = frBytesLen +) +const ( // Hash to curve params // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" @@ -70,8 +62,7 @@ const ( ) // expandMsgOutput is the output length of the expand_message step as required by the -// hash_to_curve algorithm (and the map to G1 step) -// +// hash_to_curve algorithm (and the map to G1 step). 
// (Cgo does not export C macros) var expandMsgOutput = int(C.get_mapToG1_input_len()) @@ -360,7 +351,8 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err // decodePublicKeyCompressed decodes a slice of bytes into a public key. // since we use the compressed representation by default, this checks the default and delegates to decodePublicKeyCompressed func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (PublicKey, error) { - if serializationG2 != compressed { + // in compression mode, g2BytesLen is equal to 2 * Fp_bytes + if g2BytesLen != 2*fpBytesLen { panic("library is not configured to use compressed public key serialization") } return a.decodePublicKey(publicKeyBytes) @@ -490,7 +482,8 @@ func (pk *pubKeyBLSBLS12381) Size() int { // The encoding is a compressed encoding of the point // [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { - if serializationG2 != compressed { + // in compression mode, g2BytesLen is equal to 2 * Fp_bytes + if g2BytesLen != 2*fpBytesLen { panic("library is not configured to use compressed public key serialization") } dest := make([]byte, pubKeyLengthBLSBLS12381) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 07224cd4242..00d32ccdfed 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -24,6 +24,18 @@ int get_Fr_BYTES() { return Fr_BYTES; } +int get_Fp_BYTES() { + return Fp_BYTES; +} + +int get_G1_SER_BYTES() { + return G1_SER_BYTES; +} + +int get_G2_SER_BYTES() { + return G2_SER_BYTES; +} + int get_mapToG1_input_len() { return MAP_TO_G1_INPUT_LEN; } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 94083cf9abe..f0a6c77a12d 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -44,8 +44,12 @@ type scalar C.Fr // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -// BLS12-381 related lengths +// BLS12-381 related lengths, exported as functions +// because cgo does not recognize C macros. 
var frBytesLen = int(C.get_Fr_BYTES()) +var g1BytesLen = int(C.get_G1_SER_BYTES()) +var g2BytesLen = int(C.get_G2_SER_BYTES()) +var fpBytesLen = int(C.get_Fp_BYTES()) // get some constants from the C layer // (Cgo does not export C macros) @@ -151,14 +155,14 @@ func writeScalar(dest []byte, x *scalar) { } // writePointE2 writes a G2 point in a slice of bytes -// The slice should be of size PubKeyLenBLSBLS12381 and the serialization +// The slice should be of size g2BytesLen and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointE2(dest []byte, a *pointE2) { C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) } // writePointE1 writes a G1 point in a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the serialization +// The slice should be of size g1BytesLen and the serialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves func writePointE1(dest []byte, a *pointE1) { C.E1_write_bytes((*C.uchar)(&dest[0]), (*C.E1)(a)) @@ -187,7 +191,7 @@ func readScalarFrStar(a *scalar, src []byte) error { } // readPointE2 reads a E2 point from a slice of bytes -// The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization +// The slice is expected to be of size g2BytesLen and the deserialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G2 membership check is performed. func readPointE2(a *pointE2, src []byte) error { @@ -208,7 +212,7 @@ func readPointE2(a *pointE2, src []byte) error { } // readPointE1 reads a E1 point from a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the deserialization +// The slice should be of size g1BytesLen and the deserialization // follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. // No G1 membership check is performed. 
func readPointE1(a *pointE1, src []byte) error { diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 3e8ca1b06ea..f3800e6ebef 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -46,6 +46,9 @@ typedef _Bool bool; // assuming cgo is using a modern enough compiler int get_valid(); int get_invalid(); int get_Fr_BYTES(); +int get_Fp_BYTES(); +int get_G1_SER_BYTES(); +int get_G2_SER_BYTES(); int get_mapToG1_input_len(); // BLS based SPoCK diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 17a1526414a..2c9d76bbbe5 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -26,7 +26,7 @@ func TestScalarMultBLS12381(t *testing.T) { generatorScalarMultG1(&p, &expo) expected, err := hex.DecodeString("96484ca50719f5d2533047960878b6bae8289646c0f00a942a1e6992be9981a9e0c7a51e9918f9b19d178cf04a8018a4") require.NoError(t, err) - pBytes := make([]byte, SignatureLenBLSBLS12381) + pBytes := make([]byte, g1BytesLen) writePointE1(pBytes, &p) assert.Equal(t, pBytes, expected) }) @@ -39,7 +39,7 @@ func TestScalarMultBLS12381(t *testing.T) { generatorScalarMultG2(&p, &expo) expected, err := hex.DecodeString("b35f5043f166848805b98da62dcb9c5d2f25e497bd0d9c461d4a00d19e4e67cc1e813de3c99479d5a2c62fb754fd7df40c4fd60c46834c8ae665343a3ff7dc3cc929de34ad62b7b55974f4e3fd20990d3e564b96e4d33de87716052d58cf823e") require.NoError(t, err) - pBytes := make([]byte, PubKeyLenBLSBLS12381) + pBytes := make([]byte, g2BytesLen) writePointE2(pBytes, &p) assert.Equal(t, pBytes, expected) }) @@ -130,7 +130,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { prg := getPRG(t) - seed := make([]byte, PubKeyLenBLSBLS12381) + seed := make([]byte, g2BytesLen) _, err := prg.Read(seed) require.NoError(t, err) @@ -165,7 +165,7 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { - seed := make([]byte, PubKeyLenBLSBLS12381) + seed := make([]byte, g2BytesLen) _, err := mrand.Read(seed) require.NoError(b, err) @@ -195,7 +195,7 @@ func BenchmarkSubgroupCheck(b *testing.B) { func TestReadWriteG1(t *testing.T) { prg := getPRG(t) seed := make([]byte, frBytesLen) - bytes := make([]byte, SignatureLenBLSBLS12381) + bytes := make([]byte, g1BytesLen) // generate a random G1 point, encode it, decode it, // and compare it the original point iterations := 50 diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 6d3f1765e25..7629289ba9e 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -134,7 +134,7 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { // testEncodeDecodeG1CrossBLST tests encoding and decoding of G1 points are consistent with BLST. // This test assumes signature serialization is identical to BLST. 
func testEncodeDecodeG1CrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), SignatureLenBLSBLS12381, SignatureLenBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), g1BytesLen, g1BytesLen) validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 98420cc87cf..ac76469f962 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -151,11 +151,11 @@ func (s *feldmanVSSstate) End() (PrivateKey, PublicKey, []PublicKey, error) { return x, Y, y, nil } -const ( - shareSize = PrKeyLenBLSBLS12381 +var ( + shareSize = frBytesLen // the actual verifVectorSize depends on the state and is: - // PubKeyLenBLSBLS12381*(t+1) - verifVectorSize = PubKeyLenBLSBLS12381 + // g2BytesLen*(t+1) + verifVectorSize = g2BytesLen ) // HandleBroadcastMsg processes a new broadcasted message received by the current participant. diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index a7de2fe93d9..b8056b990dc 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -201,9 +201,9 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) return x, Y, y, nil } -const ( +var ( complaintSize = 1 - complaintAnswerSize = 1 + PrKeyLenBLSBLS12381 + complaintAnswerSize = 1 + frBytesLen ) // HandleBroadcastMsg processes a new broadcasted message received by the current participant. From d2c7cbf2eb3172f6871cefd1eb7a579d61ae2bc9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 29 May 2023 13:52:29 -0600 Subject: [PATCH 121/200] more consolidation of length constants --- crypto/bls.go | 17 ++++++----------- crypto/bls12381_utils.go | 4 ++-- crypto/bls_core.c | 21 ++++----------------- crypto/bls_crossBLST_test.go | 4 ++-- crypto/bls_include.h | 11 ----------- crypto/bls_multisig.go | 12 ++++++------ crypto/bls_test.go | 4 ++-- crypto/bls_thresholdsign.go | 8 ++++---- crypto/spock.go | 2 +- 9 files changed, 27 insertions(+), 56 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 65c7ce4d390..f515a9445dc 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -184,7 +184,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, err } - if len(s) != signatureLengthBLSBLS12381 { + if len(s) != SignatureLenBLSBLS12381 { return false, nil } @@ -214,7 +214,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) // 0xC0 is the header of the point at infinity serialization (either in G1 or G2) const infinityPointHeader = byte(0xC0) -var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, signatureLengthBLSBLS12381-1)...) +var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, SignatureLenBLSBLS12381-1)...) // IsBLSSignatureIdentity checks whether the input signature is // the identity signature (point at infinity in G1). @@ -327,9 +327,9 @@ func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, // a faster check during signature verifications. Any verification against an identity // public key outputs `false`. 
func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, error) { - if len(publicKeyBytes) != pubKeyLengthBLSBLS12381 { + if len(publicKeyBytes) != PubKeyLenBLSBLS12381 { return nil, invalidInputsErrorf("input length must be %d, got %d", - pubKeyLengthBLSBLS12381, len(publicKeyBytes)) + PubKeyLenBLSBLS12381, len(publicKeyBytes)) } var pk pubKeyBLSBLS12381 err := readPointE2(&pk.point, publicKeyBytes) @@ -415,7 +415,7 @@ func (sk *prKeyBLSBLS12381) PublicKey() PublicKey { // Encode returns a byte encoding of the private key. // The encoding is a raw encoding in big endian padded to the group order func (a *prKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, prKeyLengthBLSBLS12381) + dest := make([]byte, frBytesLen) writeScalar(dest, &a.scalar) return dest } @@ -486,7 +486,7 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { if g2BytesLen != 2*fpBytesLen { panic("library is not configured to use compressed public key serialization") } - dest := make([]byte, pubKeyLengthBLSBLS12381) + dest := make([]byte, g2BytesLen) writePointE2(dest, &a.point) return dest } @@ -511,11 +511,6 @@ func (pk *pubKeyBLSBLS12381) String() string { return pk.point.String() } -// Get Macro definitions from the C layer as Cgo does not export macros -var signatureLengthBLSBLS12381 = int(C.get_signature_len()) -var pubKeyLengthBLSBLS12381 = int(C.get_pk_len()) -var prKeyLengthBLSBLS12381 = int(C.get_sk_len()) - // This is only a TEST function. // signWithXMDSHA256 signs a message using XMD_SHA256 as a hash to field. // diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index f0a6c77a12d..0cc7e75a509 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -70,7 +70,7 @@ func (a *scalar) String() string { } func (p *pointE2) String() string { - encoding := make([]byte, pubKeyLengthBLSBLS12381) + encoding := make([]byte, g2BytesLen) writePointE2(encoding, p) return fmt.Sprintf("%#x", encoding) } @@ -307,7 +307,7 @@ func hashToG1Bytes(data, dst []byte) []byte { } // serialize the point - pointBytes := make([]byte, signatureLengthBLSBLS12381) + pointBytes := make([]byte, g1BytesLen) writePointE1(pointBytes, &point) return pointBytes } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 6711320cf51..e1578a150fe 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -4,19 +4,6 @@ // The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) -// functions to export macros to the Go layer (because cgo does not import macros) -int get_signature_len() { - return SIGNATURE_LEN; -} - -int get_pk_len() { - return PK_LEN; -} - -int get_sk_len() { - return SK_LEN; -} - // Computes a BLS signature from a G1 point and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. 
static void bls_sign_E1(byte* out, const Fr* sk, const E1* h) { @@ -93,7 +80,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = sig - if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { ret = INVALID; goto out; } @@ -167,7 +154,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = s - if (E1_read_bytes(&elemsG1[0], sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { ret = INVALID; goto out; } @@ -243,7 +230,7 @@ int bls_verifyPerDistinctKey(const byte* sig, int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { E1 s, h; // deserialize the signature into a curve point - if (E1_read_bytes(&s, sig, SIGNATURE_LEN) != BLST_SUCCESS) { + if (E1_read_bytes(&s, sig, G1_SER_BYTES) != BLST_SUCCESS) { return INVALID; } @@ -393,7 +380,7 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // the tree aggregations remain valid. // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. - int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); + int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES); if (read_ret != BLST_SUCCESS || !E1_in_G1(&sigs[i])) { // set signature and key to infinity (no effect on the aggregation tree) // and set result to invalid (result won't be overwritten) diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 7629289ba9e..3b3939eaf6c 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -80,7 +80,7 @@ func validSignatureBytesBLST(t *rapid.T) []byte { // testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. // This test assumes private key serialization is identical to the one in BLST. 
func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), prKeyLengthBLSBLS12381, prKeyLengthBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), PrKeyLenBLSBLS12381, PrKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key @@ -154,7 +154,7 @@ func testEncodeDecodeG1CrossBLST(t *rapid.T) { // check both serializations of G1 points are equal if flowPass && blstPass { - sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) + sigFlowOutBytes := make([]byte, g1BytesLen) writePointE1(sigFlowOutBytes, &pointFlow) sigBLSTOutBytes := pointBLST.Compress() assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes) diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 4b8e1075501..1ca61b376c4 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -5,19 +5,8 @@ #include "bls12381_utils.h" -// Signature, Public key and Private key lengths -#define FULL_SIGNATURE_LEN G1_BYTES -#define FULL_PK_LEN G2_BYTES -#define SIGNATURE_LEN (FULL_SIGNATURE_LEN/(G1_SERIALIZATION+1)) -#define PK_LEN (FULL_PK_LEN/(G2_SERIALIZATION+1)) -#define SK_BITS (Fr_BITS) -#define SK_LEN BITS_TO_BYTES(SK_BITS) // bls core (functions in bls_core.c) -int get_signature_len(); -int get_pk_len(); -int get_sk_len(); - int bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 5714b7e2a34..6c99ae461e2 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -102,14 +102,14 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { } // flatten the shares (required by the C layer) - flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) + flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs)) for i, sig := range sigs { - if len(sig) != signatureLengthBLSBLS12381 { + if len(sig) != SignatureLenBLSBLS12381 { return nil, fmt.Errorf("signature at index %d has an invalid length: %w", i, invalidSignatureError) } flatSigs = append(flatSigs, sig...) } - aggregatedSig := make([]byte, signatureLengthBLSBLS12381) + aggregatedSig := make([]byte, SignatureLenBLSBLS12381) // add the points in the C layer result := C.E1_sum_vector_byte( @@ -325,7 +325,7 @@ func VerifyBLSSignatureManyMessages( ) (bool, error) { // check signature length - if len(s) != signatureLengthBLSBLS12381 { + if len(s) != SignatureLenBLSBLS12381 { return false, nil } // check the list lengths @@ -494,7 +494,7 @@ func BatchVerifyBLSSignaturesOneMessage( } // flatten the shares (required by the C layer) - flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) + flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs)) pkPoints := make([]pointE2, 0, len(pks)) getIdentityPoint := func() pointE2 { @@ -508,7 +508,7 @@ func BatchVerifyBLSSignaturesOneMessage( return falseSlice, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) } - if len(sigs[i]) != signatureLengthBLSBLS12381 || pkBLS.isIdentity { + if len(sigs[i]) != SignatureLenBLSBLS12381 || pkBLS.isIdentity { // case of invalid signature: set the signature and public key at index `i` // to identities so that there is no effect on the aggregation tree computation. // However, the boolean return for index `i` is set to `false` and won't be overwritten. 
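Aside (illustrative only, not part of the patch): under the compressed Zcash-style serialization used here, the consolidated length constants correspond to the usual BLS12-381 sizes. A minimal Go sketch with hypothetical names:

const (
    frLen = 32        // F_r element: scalar / private key
    fpLen = 48        // F_p element
    g1Len = fpLen     // compressed G1 point: signature
    g2Len = 2 * fpLen // compressed G2 point: public key
)
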
diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 801af0a24a5..d8561ccc5f6 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -411,7 +411,7 @@ func TestBLSAggregateSignatures(t *testing.T) { assert.False(t, result) // test with a signature of a wrong length - shortSig := sigs[0][:signatureLengthBLSBLS12381-1] + shortSig := sigs[0][:SignatureLenBLSBLS12381-1] aggSig, err = AggregateBLSSignatures([]Signature{shortSig}) assert.Error(t, err) assert.True(t, IsInvalidSignatureError(err)) @@ -1199,7 +1199,7 @@ func TestBLSIdentity(t *testing.T) { sk := randomSK(t, rand) sig, err := sk.Sign(msg, hasher) require.NoError(t, err) - oppositeSig := make([]byte, signatureLengthBLSBLS12381) + oppositeSig := make([]byte, SignatureLenBLSBLS12381) copy(oppositeSig, sig) negatePoint(oppositeSig) aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig}) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index a7eaad5a2a4..2f05ed72c42 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -399,10 +399,10 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat return nil, notEnoughSharesErrorf("number of signature shares %d is not enough, %d are required", len(s.shares), s.threshold+1) } - thresholdSignature := make([]byte, signatureLengthBLSBLS12381) + thresholdSignature := make([]byte, SignatureLenBLSBLS12381) // prepare the C layer inputs - shares := make([]byte, 0, len(s.shares)*signatureLengthBLSBLS12381) + shares := make([]byte, 0, len(s.shares)*SignatureLenBLSBLS12381) signers := make([]index, 0, len(s.shares)) for index, share := range s.shares { shares = append(shares, share...) @@ -482,7 +482,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, m := make(map[index]bool) // flatten the shares (required by the C layer) - flatShares := make([]byte, 0, signatureLengthBLSBLS12381*(threshold+1)) + flatShares := make([]byte, 0, SignatureLenBLSBLS12381*(threshold+1)) indexSigners := make([]index, 0, threshold+1) for i, share := range shares { flatShares = append(flatShares, share...) @@ -500,7 +500,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, indexSigners = append(indexSigners, index(signers[i])+1) } - thresholdSignature := make([]byte, signatureLengthBLSBLS12381) + thresholdSignature := make([]byte, SignatureLenBLSBLS12381) // Lagrange Interpolate at point 0 if C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), diff --git a/crypto/spock.go b/crypto/spock.go index 8180b9b72bd..da269c23ac1 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -73,7 +73,7 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur return false, notBLSKeyError } - if len(proof1) != signatureLengthBLSBLS12381 || len(proof2) != signatureLengthBLSBLS12381 { + if len(proof1) != g1BytesLen || len(proof2) != g1BytesLen { return false, nil } From a88897291207f75d455f82e8af45dd863e444198 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 29 May 2023 15:23:51 -0600 Subject: [PATCH 122/200] cgo supports macros! 
use C constant macros in go --- crypto/bls.go | 8 +++----- crypto/bls12381_utils.c | 33 +-------------------------------- crypto/bls12381_utils.go | 33 +++++++++++++++------------------ crypto/bls12381_utils.h | 9 --------- 4 files changed, 19 insertions(+), 64 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index f515a9445dc..9930db2a3b4 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -59,13 +59,11 @@ const ( // Cipher suite used for BLS PoP of the form : BLS_POP_ || h2cSuiteID || SchemeTag_ // The PoP cipher suite is guaranteed to be different than all signature ciphersuites blsPOPCipherSuite = "BLS_POP_" + h2cSuiteID + schemeTag + // expandMsgOutput is the output length of the expand_message step as required by the + // hash_to_curve algorithm (and the map to G1 step). + expandMsgOutput = int(C.MAP_TO_G1_INPUT_LEN) ) -// expandMsgOutput is the output length of the expand_message step as required by the -// hash_to_curve algorithm (and the map to G1 step). -// (Cgo does not export C macros) -var expandMsgOutput = int(C.get_mapToG1_input_len()) - // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { // the signing algo and parameters diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 00d32ccdfed..5edbb92b78e 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -9,37 +9,6 @@ // compile all blst C src along with this file #include "blst_src.c" -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) - -// return macro values to the upper Go Layer -int get_valid() { - return VALID; -} - -int get_invalid() { - return INVALID; -} - -int get_Fr_BYTES() { - return Fr_BYTES; -} - -int get_Fp_BYTES() { - return Fp_BYTES; -} - -int get_G1_SER_BYTES() { - return G1_SER_BYTES; -} - -int get_G2_SER_BYTES() { - return G2_SER_BYTES; -} - -int get_mapToG1_input_len() { - return MAP_TO_G1_INPUT_LEN; -} - // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -47,7 +16,7 @@ int get_mapToG1_input_len() { const Fr BLS12_381_rR = {{ \ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ - }}; +}}; // returns true if a == 0 and false otherwise bool Fr_is_zero(const Fr* a) { diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 0cc7e75a509..75b9385d3ab 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -44,24 +44,21 @@ type scalar C.Fr // TODO: For now scalars are represented as field elements Fr since all scalars // are less than r - check if distinguishing two types in necessary -// BLS12-381 related lengths, exported as functions -// because cgo does not recognize C macros. 
-var frBytesLen = int(C.get_Fr_BYTES()) -var g1BytesLen = int(C.get_G1_SER_BYTES()) -var g2BytesLen = int(C.get_G2_SER_BYTES()) -var fpBytesLen = int(C.get_Fp_BYTES()) - -// get some constants from the C layer -// (Cgo does not export C macros) -var valid = C.get_valid() -var invalid = C.get_invalid() - -// get some constants from the C layer -// var blst_errors = C.blst_get_errors() -var blst_valid = (int)(C.BLST_SUCCESS) -var blst_bad_encoding = (int)(C.BLST_BAD_ENCODING) -var blst_bad_scalar = (int)(C.BLST_BAD_SCALAR) -var blst_point_not_on_curve = (int)(C.BLST_POINT_NOT_ON_CURVE) +const ( + // BLS12-381 related lengths imported from the C layer + frBytesLen = int(C.Fr_BYTES) + g1BytesLen = int(C.G1_SER_BYTES) + g2BytesLen = int(C.G2_SER_BYTES) + fpBytesLen = int(C.Fp_BYTES) + + // more internal constants from the C layer + valid = C.VALID + invalid = C.INVALID + blst_valid = int(C.BLST_SUCCESS) + blst_bad_encoding = int(C.BLST_BAD_ENCODING) + blst_bad_scalar = int(C.BLST_BAD_SCALAR) + blst_point_not_on_curve = int(C.BLST_POINT_NOT_ON_CURVE) +) func (a *scalar) String() string { encoding := make([]byte, frBytesLen) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index f3800e6ebef..48a7b1476de 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -42,15 +42,6 @@ typedef _Bool bool; // assuming cgo is using a modern enough compiler #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) - -int get_valid(); -int get_invalid(); -int get_Fr_BYTES(); -int get_Fp_BYTES(); -int get_G1_SER_BYTES(); -int get_G2_SER_BYTES(); -int get_mapToG1_input_len(); - // BLS based SPoCK int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); From 945f2b9fb3feca4bb5aeeb5efcbfeffd6411647f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 29 May 2023 19:16:32 -0600 Subject: [PATCH 123/200] define new internal ERROR type to abstract BLST_ERROR --- crypto/bls.go | 4 +- crypto/bls12381_utils.c | 270 +++++++++++++------------------- crypto/bls12381_utils.go | 49 +++--- crypto/bls12381_utils.h | 61 ++++---- crypto/bls_core.c | 58 ++++++- crypto/bls_include.h | 6 +- crypto/bls_multisig.go | 7 - crypto/bls_thresholdsign_core.c | 4 +- crypto/blst_include.h | 5 - crypto/dkg_core.c | 6 +- crypto/dkg_feldmanvss.go | 2 +- crypto/dkg_include.h | 2 +- 12 files changed, 231 insertions(+), 243 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 9930db2a3b4..93dd487a817 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -39,16 +39,14 @@ import ( "github.com/onflow/flow-go/crypto/hash" ) -var ( +const ( // SignatureLenBLSBLS12381 is the size of a `G_1` element. SignatureLenBLSBLS12381 = g1BytesLen // PubKeyLenBLSBLS12381 is the size of a `G_2` element. PubKeyLenBLSBLS12381 = g2BytesLen // PrKeyLenBLSBLS12381 is the size of a `F_r` element, where `r` is the order of `G_1` and `G_2`. PrKeyLenBLSBLS12381 = frBytesLen -) -const ( // Hash to curve params // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 5edbb92b78e..35bf1ff4686 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -172,40 +172,40 @@ static void pow256_from_Fr(pow256 ret, const Fr* in) { // reads a scalar in `a` and checks it is a valid Fr element (a < r). // input is bytes-big-endian. 
// returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fr +// - VALID if the scalar is valid +ERROR Fr_read_bytes(Fr* a, const byte *bin, int len) { if (len != Fr_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } pow256 tmp; // compare to r using the provided tool from BLST pow256_from_be_bytes(tmp, bin); // TODO: check endianness!! if (!check_mod_256(tmp, BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! - return BLST_BAD_SCALAR; + return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); limbs_from_be_bytes((limb_t*)a, bin, Fr_BYTES); // TODO: check endianness!! - return BLST_SUCCESS; + return VALID; } // reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). // input bytes are big endian. // returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fr_star -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fr_star +// - VALID if the scalar is valid +ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len) { int ret = Fr_read_bytes(a, bin, len); - if (ret != BLST_SUCCESS) { + if (ret != VALID) { return ret; } // check if a=0 if (Fr_is_zero(a)) { - return BLST_BAD_SCALAR; + return BAD_VALUE; } - return BLST_SUCCESS; + return VALID; } // write Fr element `a` in big endian bytes. @@ -329,19 +329,19 @@ void Fp_from_montg(Fp *res, const Fp *a) { // reads a scalar in `a` and checks it is a valid Fp element (a < p). // input is bytes-big-endian. // returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fp -// - BLST_SUCCESS if the scalar is valid -BLST_ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fp +// - VALID if the scalar is valid +ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { if (len != Fp_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); // compare read scalar to p if (!Fp_check(a)) { - return BLST_BAD_ENCODING; + return BAD_VALUE; } - return BLST_SUCCESS; + return VALID; } @@ -413,22 +413,22 @@ static byte Fp2_get_sign(Fp2* y) { // input is a serialization of real(a) concatenated to serializetion of imag(a). // a[i] are both Fp elements. // returns: -// - BLST_BAD_ENCODING if the length is invalid -// - BLST_BAD_SCALAR if the scalar isn't in Fp -// - BLST_SUCCESS if the scalar is valid -static BLST_ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fp +// - VALID if the scalar is valid +static ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { if (len != Fp2_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } - BLST_ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); - if (ret != BLST_SUCCESS) { + ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); + if (ret != VALID) { return ret; } ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); - if ( ret != BLST_SUCCESS) { + if ( ret != VALID) { return ret; } - return BLST_SUCCESS; + return VALID; } // write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. 
@@ -494,23 +494,23 @@ bool E1_in_G1(const E1* p){ // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // // returns: -// - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid -// - BLST_BAD_SCALAR if Fp coordinates couldn't deserialize -// - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E1 -// - BLST_SUCCESS if deserialization is valid +// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_VALUE if Fp coordinates couldn't deserialize +// - POINT_NOT_ON_CURVE if deserialized point isn't on E1 +// - VALID if deserialization is valid // TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, // and update logic with G2 subgroup check? -BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { +ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { // check the length if (len != G1_SER_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check the compression bit int compressed = bin[0] >> 7; if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check if the point in infinity @@ -518,29 +518,29 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (is_infinity) { // the remaining bits need to be cleared if (bin[0] & 0x3F) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } for (int i=1; i> 5) & 1; if (y_sign && (!compressed)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp_BYTES]; memcpy(temp, bin, Fp_BYTES); temp[0] &= 0x1F; // clear the header bits - BLST_ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != BLST_SUCCESS) { + ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { return ret; } @@ -549,14 +549,14 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (G1_SERIALIZATION == UNCOMPRESSED) { ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); - if (ret != BLST_SUCCESS){ + if (ret != VALID){ return ret; } // check read point is on curve if (!E1_affine_on_curve(a)) { - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; } - return BLST_SUCCESS; + return VALID; } // compute the possible square root @@ -565,13 +565,13 @@ BLST_ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form if (!Fp_sqrt_montg(&a->y, &a->y)) // check whether x^3+b is a quadratic residue - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp_get_sign(&a->y) != y_sign) { Fp_neg(&a->y, &a->y); // flip y sign if needed } - return BLST_SUCCESS; + return VALID; } // E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or uncompressed form. @@ -649,7 +649,7 @@ int E1_sum_vector_byte(byte* dest, const byte* sigs_bytes, const int sigs_len) { // import the points from the array for (int i=0; i < n; i++) { // deserialize each point from the input array - if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES) != VALID) { error = INVALID; goto out; } @@ -706,9 +706,46 @@ int map_to_G1(E1* h, const byte* hash, const int len) { return VALID; } +// maps the bytes to a point in G1. 
+// `len` should be at least Fr_BYTES. +// this is a testing file only, should not be used in any protocol! +void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { + assert(len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); + // multiplies G1 generator by a random scalar + G1_mult_gen(p, &log); +} + +// generates a point in E1\G1 and stores it in p +// this is a testing file only, should not be used in any protocol! +ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { + assert(G1_SERIALIZATION == COMPRESSED); + assert(len >= G1_SER_BYTES); + + // attempt to deserilize a compressed E1 point from input bytes + // after fixing the header 2 bits + byte copy[G1_SER_BYTES]; + memcpy(copy, bytes, sizeof(copy)); + copy[0] |= 1<<7; // set compression bit + copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity + + ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); + if (ser != VALID) { + return ser; + } + + // map the point to E2\G2 by clearing G2 order + E1_mult(p, p, (const Fr*)BLS12_381_r); + E1_to_affine(p, p); + + assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 + return VALID; +} + // ------------------- E2 utilities -const E1* BLS12_381_g1 = (const E1*)&BLS12_381_G1; /// TODO:delete const E2* BLS12_381_g2 = (const E2*)&BLS12_381_G2; const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; @@ -716,23 +753,23 @@ const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; // The resulting point is guaranteed to be on curve E2 (no G2 check is included). // // returns: -// - BLST_BAD_ENCODING if the length is invalid or serialization header bits are invalid -// - BLST_BAD_SCALAR if Fp^2 coordinates couldn't deserialize -// - BLST_POINT_NOT_ON_CURVE if deserialized point isn't on E2 -// - BLST_SUCCESS if deserialization is valid +// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_VALUE if Fp^2 coordinates couldn't deserialize +// - POINT_NOT_ON_CURVE if deserialized point isn't on E2 +// - VALID if deserialization is valid // TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update logic with G2 subgroup check? 
-BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { +ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check the compression bit int compressed = bin[0] >> 7; if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // check if the point in infinity @@ -740,29 +777,29 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { if (is_infinity) { // the remaining bits need to be cleared if (bin[0] & 0x3F) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } for (int i=1; i> 5) & 1; if (y_sign && (!compressed)) { - return BLST_BAD_ENCODING; + return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp2_BYTES]; memcpy(temp, bin, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits - BLST_ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != BLST_SUCCESS) { + ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { return ret; } @@ -773,14 +810,14 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { if (G2_SERIALIZATION == UNCOMPRESSED) { ret = Fp2_read_bytes(&(a->y), bin + Fp2_BYTES, sizeof(a->y)); - if (ret != BLST_SUCCESS){ + if (ret != VALID){ return ret; } // check read point is on curve if (!E2_affine_on_curve(a)) { - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; } - return BLST_SUCCESS; + return VALID; } // compute the possible square root @@ -793,13 +830,13 @@ BLST_ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { Fp2_mul_montg(a_y, a_y, a_x); Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue - return BLST_POINT_NOT_ON_CURVE; + return POINT_NOT_ON_CURVE; // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp2_get_sign(a_y) != y_sign) { Fp2_neg(a_y, a_y); // flip y sign if needed } - return BLST_SUCCESS; + return VALID; } // E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or uncompressed form. @@ -929,52 +966,6 @@ void E2_sum_vector(E2* sum, const E2* y, const int len){ } } -// ------------------- other - - -// Verifies the validity of 2 SPoCK proofs and 2 public keys. -// Membership check in G1 of both proofs is verified in this function. -// Membership check in G2 of both keys is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications -// using the same public keys. 
-int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { - E1 elemsG1[2]; - E2 elemsG2[2]; - - // elemsG1[0] = s1 - if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != BLST_SUCCESS) { - return INVALID; - }; - // check s1 is in G1 - if (!E1_in_G1(&elemsG1[0])) { - return INVALID; - } - - // elemsG1[1] = s2 - if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != BLST_SUCCESS) { - return INVALID; - }; - // check s2 is in G1 - if (!E1_in_G1(&elemsG1[1])) { - return INVALID; - } - - // elemsG2[1] = pk1 - E2_copy(&elemsG2[1], pk1); - - // elemsG2[0] = -pk2 - E2_neg(&elemsG2[0], pk2); - - // double pairing - Fp12 e; - multi_pairing(&e, elemsG1 , elemsG2, 2); - - if (Fp12_is_one(&e)) { - return VALID; - } - return INVALID; -} - // Subtracts all G2 array elements `y` from an element `x` and writes the // result in res void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ @@ -983,45 +974,6 @@ void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len){ E2_add(res, x, res); } - -// maps the bytes to a point in G1. -// `len` should be at least Fr_BYTES. -// this is a testing file only, should not be used in any protocol! -void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { - assert(len >= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, bytes, len); - // multiplies G1 generator by a random scalar - G1_mult_gen(p, &log); -} - -// generates a point in E1\G1 and stores it in p -// this is a testing file only, should not be used in any protocol! -BLST_ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { - assert(G1_SERIALIZATION == COMPRESSED); - assert(len >= G1_SER_BYTES); - - // attempt to deserilize a compressed E1 point from input bytes - // after fixing the header 2 bits - byte copy[G1_SER_BYTES]; - memcpy(copy, bytes, sizeof(copy)); - copy[0] |= 1<<7; // set compression bit - copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - - BLST_ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); - if (ser != BLST_SUCCESS) { - return ser; - } - - // map the point to E2\G2 by clearing G2 order - E1_mult(p, p, (const Fr*)BLS12_381_r); - E1_to_affine(p, p); - - assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 - return BLST_SUCCESS; -} - // maps the bytes to a point in G2. // `len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! @@ -1035,11 +987,11 @@ void unsafe_map_bytes_to_G2(E2* p, const byte* bytes, int len) { } // attempts to map `bytes` to a point in E2\G2 and stores it in p. -// `len` should be at least G2_SER_BYTES. It returns BLST_SUCCESS only if mapping +// `len` should be at least G2_SER_BYTES. It returns VALID only if mapping // succeeds. // For now, function only works when E2 serialization is compressed. // this is a testing tool only, it should not be used in any protocol! 
-BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { +ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { assert(G2_SERIALIZATION == COMPRESSED); assert(len >= G2_SER_BYTES); @@ -1050,8 +1002,8 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { copy[0] |= 1<<7; // set compression bit copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - BLST_ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); - if (ser != BLST_SUCCESS) { + ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); + if (ser != VALID) { return ser; } @@ -1060,7 +1012,7 @@ BLST_ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { E2_to_affine(p, p); assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 - return BLST_SUCCESS; + return VALID; } // ------------------- Pairing utilities @@ -1079,7 +1031,7 @@ void Fp12_set_one(Fp12 *a) { // It assumes `p` and `q` are correctly initialized and all // p[i] and q[i] are respectively on G1 and G2 (it does not // check their memberships). -void multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { +void Fp12_multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { // easier access pointer vec384fp6* res_vec = (vec384fp6*)res; // N_MAX is defined within BLST. It should represent a good tradeoff of the max number diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 75b9385d3ab..812319ced63 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -41,23 +41,23 @@ type pointE1 C.E1 type pointE2 C.E2 type scalar C.Fr -// TODO: For now scalars are represented as field elements Fr since all scalars -// are less than r - check if distinguishing two types in necessary +// Note that scalars and field elements F_r are represented in Go by the same type +// called `scalar`, which is internally represented by C type `Fr`. Scalars used by the +// Go layer are all reduced modulo the curve order `r`. 
const ( // BLS12-381 related lengths imported from the C layer frBytesLen = int(C.Fr_BYTES) + fpBytesLen = int(C.Fp_BYTES) g1BytesLen = int(C.G1_SER_BYTES) g2BytesLen = int(C.G2_SER_BYTES) - fpBytesLen = int(C.Fp_BYTES) - // more internal constants from the C layer - valid = C.VALID - invalid = C.INVALID - blst_valid = int(C.BLST_SUCCESS) - blst_bad_encoding = int(C.BLST_BAD_ENCODING) - blst_bad_scalar = int(C.BLST_BAD_SCALAR) - blst_point_not_on_curve = int(C.BLST_POINT_NOT_ON_CURVE) + // error constants imported from the C layer + valid = C.VALID + invalid = C.INVALID + badEncoding = C.BAD_ENCODING + badValue = C.BAD_VALUE + pointNotOnCurve = C.POINT_NOT_ON_CURVE ) func (a *scalar) String() string { @@ -173,18 +173,17 @@ func readScalarFrStar(a *scalar, src []byte) error { (*C.uchar)(&src[0]), (C.int)(len(src))) - switch int(read) { - case blst_valid: + switch read { + case valid: return nil - case blst_bad_encoding: + case badEncoding: return invalidInputsErrorf("input length must be %d, got %d", frBytesLen, len(src)) - case blst_bad_scalar: + case badValue: return invalidInputsErrorf("scalar is not in the correct range w.r.t the BLS12-381 curve") default: return invalidInputsErrorf("reading the scalar failed") } - } // readPointE2 reads a E2 point from a slice of bytes @@ -196,12 +195,12 @@ func readPointE2(a *pointE2, src []byte) error { (*C.uchar)(&src[0]), (C.int)(len(src))) - switch int(read) { - case blst_valid: + switch read { + case valid: return nil - case blst_bad_encoding, blst_bad_scalar: + case badEncoding, badValue: return invalidInputsErrorf("input could not deserialize to a E2 point") - case blst_point_not_on_curve: + case pointNotOnCurve: return invalidInputsErrorf("input is not a point on curve E2") default: return errors.New("reading E2 point failed") @@ -217,12 +216,12 @@ func readPointE1(a *pointE1, src []byte) error { (*C.uchar)(&src[0]), (C.int)(len(src))) - switch int(read) { - case blst_valid: + switch read { + case valid: return nil - case blst_bad_encoding, blst_bad_scalar: + case badEncoding, badValue: return invalidInputsErrorf("input could not deserialize to a E1 point") - case blst_point_not_on_curve: + case pointNotOnCurve: return invalidInputsErrorf("input is not a point on curve E1") default: return errors.New("reading E1 point failed") @@ -263,7 +262,7 @@ func unsafeMapToG1(pt *pointE1, seed []byte) { // It generates a random point in E2\G2 and stores it in input point. func unsafeMapToG1Complement(pt *pointE1, seed []byte) bool { res := C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == blst_valid + return int(res) == valid } // unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. @@ -277,7 +276,7 @@ func unsafeMapToG2(pt *pointE2, seed []byte) { // It generates a random point in E2\G2 and stores it in input point. func unsafeMapToG2Complement(pt *pointE2, seed []byte) bool { res := C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == blst_valid + return int(res) == valid } // This is only a TEST function. 
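Aside (a standalone sketch, not the library's code): this change and the previous one both rely on cgo exposing simple C #define constants and enum values directly to Go, where they can be used in const blocks and switch statements. A minimal self-contained example with hypothetical names:

package main

/*
#define FR_BYTES 32
typedef enum { VALID = 0, INVALID, BAD_ENCODING } ERROR;
*/
import "C"

import "fmt"

const frBytes = int(C.FR_BYTES) // simple numeric macro, usable in a Go const block

func main() {
    var e C.ERROR = C.BAD_ENCODING
    switch e {
    case C.VALID:
        fmt.Println("valid")
    case C.BAD_ENCODING:
        fmt.Println("bad encoding, expected", frBytes, "bytes")
    default:
        fmt.Println("other error")
    }
}
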
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 48a7b1476de..921df90624d 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -11,10 +11,18 @@ typedef uint8_t byte; typedef _Bool bool; // assuming cgo is using a modern enough compiler +// minimum targeted security level #define SEC_BITS 128 -#define VALID 0 -#define INVALID 1 -#define UNDEFINED (((VALID&1)^1) | ((INVALID&2)^2)) // different value than RLC_OK and RLC_ERR + +typedef enum { + VALID = 0, + INVALID, + BAD_ENCODING, + BAD_VALUE, + POINT_NOT_ON_CURVE, + POINT_NOT_IN_GROUP, + UNDEFINED, +} ERROR; #define BITS_TO_BYTES(x) ((x+7)>>3) #define BITS_TO_LIMBS(x) ((x+63)>>6) @@ -42,13 +50,10 @@ typedef _Bool bool; // assuming cgo is using a modern enough compiler #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) #define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) -// BLS based SPoCK -int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); - // Fr utilities extern const Fr BLS12_381_rR; -bool Fr_is_zero(const Fr* a); -bool Fr_is_equal(const Fr* a, const Fr* b); +bool Fr_is_zero(const Fr* a); +bool Fr_is_equal(const Fr* a, const Fr* b); void Fr_set_limb(Fr*, const limb_t); void Fr_copy(Fr*, const Fr*); void Fr_set_zero(Fr*); @@ -63,10 +68,10 @@ void Fr_from_montg(Fr *res, const Fr *a); void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); void Fr_inv_exp_montg(Fr *res, const Fr *a); -BLST_ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); -BLST_ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); +ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); +ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); void Fr_write_bytes(byte *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const byte*, int); +bool map_bytes_to_Fr(Fr*, const byte*, int); // Fp utilities void Fp_mul_montg(Fp *, const Fp *, const Fp *); @@ -74,34 +79,34 @@ void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities void E1_copy(E1*, const E1*); -bool E1_is_equal(const E1*, const E1*); +bool E1_is_equal(const E1*, const E1*); void E1_set_infty(E1*); -bool E1_is_infty(const E1*); +bool E1_is_infty(const E1*); void E1_to_affine(E1*, const E1*); -bool E1_affine_on_curve(const E1*); -bool E1_in_G1(const E1*); +bool E1_affine_on_curve(const E1*); +bool E1_in_G1(const E1*); void E1_mult(E1*, const E1*, const Fr*); void E1_add(E1*, const E1*, const E1*); void E1_neg(E1*, const E1*); void E1_sum_vector(E1*, const E1*, const int); int E1_sum_vector_byte(byte*, const byte*, const int); void G1_mult_gen(E1*, const Fr*); -BLST_ERROR E1_read_bytes(E1*, const byte *, const int); +ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); void unsafe_map_bytes_to_G1(E1*, const byte*, int); -BLST_ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); -// hash to curve functions (functions in bls12381_hashtocurve.c) -#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) -int map_to_G1(E1*, const byte*, const int); +ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); + +#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) +int map_to_G1(E1*, const byte*, const int); // functions in bls12381_hashtocurve.c // E2 and G2 utilities void E2_set_infty(E2* p); -bool E2_is_infty(const E2*); -bool E2_affine_on_curve(const E2*); -bool E2_is_equal(const E2*, const E2*); +bool E2_is_infty(const E2*); +bool E2_affine_on_curve(const E2*); +bool E2_is_equal(const E2*, const E2*); void E2_copy(E2*, const E2*); 
void E2_to_affine(E2*, const E2*); -BLST_ERROR E2_read_bytes(E2*, const byte *, const int); +ERROR E2_read_bytes(E2*, const byte *, const int); void E2_write_bytes(byte *, const E2*); void G2_mult_gen(E2*, const Fr*); void E2_mult(E2*, const E2*, const Fr*); @@ -110,14 +115,14 @@ void E2_add(E2* res, const E2* a, const E2* b); void E2_neg(E2*, const E2*); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -bool E2_in_G2(const E2*); +bool E2_in_G2(const E2*); void unsafe_map_bytes_to_G2(E2*, const byte*, int); -BLST_ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); +ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 -bool Fp12_is_one(Fp12*); +bool Fp12_is_one(Fp12*); void Fp12_set_one(Fp12*); -void multi_pairing(Fp12*, const E1*, const E2*, const int); +void Fp12_multi_pairing(Fp12*, const E1*, const E2*, const int); // utility testing function void xmd_sha256(byte *, int, byte *, int, byte *, int); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index e1578a150fe..0771269ed86 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -47,7 +47,7 @@ static int bls_verify_E1(const E2* pk, const E1* s, const E1* h) { // double pairing Fp12 e; - multi_pairing(&e, elemsG1, elemsG2, 2); + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); if (Fp12_is_one(&e)) { return VALID; } @@ -80,7 +80,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = sig - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { ret = INVALID; goto out; } @@ -113,7 +113,7 @@ int bls_verifyPerDistinctMessage(const byte* sig, // multi pairing Fp12 e; - multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); + Fp12_multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); if (Fp12_is_one(&e)) { ret = VALID; } else { @@ -154,7 +154,7 @@ int bls_verifyPerDistinctKey(const byte* sig, if (!elemsG2) goto outG2; // elemsG1[0] = s - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { ret = INVALID; goto out; } @@ -206,7 +206,7 @@ int bls_verifyPerDistinctKey(const byte* sig, // multi pairing Fp12 e; - multi_pairing(&e, elemsG1, elemsG2, nb_pks+1); + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks+1); if (Fp12_is_one(&e)) { ret = VALID; @@ -230,7 +230,7 @@ int bls_verifyPerDistinctKey(const byte* sig, int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { E1 s, h; // deserialize the signature into a curve point - if (E1_read_bytes(&s, sig, G1_SER_BYTES) != BLST_SUCCESS) { + if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { return INVALID; } @@ -381,7 +381,7 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, // - valid points are multiplied by a random scalar (same for public keys at same index) // to make sure a signature at index (i) is verified against the public key at the same index. 
int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES); - if (read_ret != BLST_SUCCESS || !E1_in_G1(&sigs[i])) { + if (read_ret != VALID || !E1_in_G1(&sigs[i])) { // set signature and key to infinity (no effect on the aggregation tree) // and set result to invalid (result won't be overwritten) E2_set_infty(&pks[i]); @@ -420,3 +420,47 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, out_sigs: free(pks); } + +// Verifies the validity of 2 SPoCK proofs and 2 public keys. +// Membership check in G1 of both proofs is verified in this function. +// Membership check in G2 of both keys is not verified in this function. +// the membership check in G2 is separated to allow optimizing multiple verifications +// using the same public keys. +int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s1 + if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s1 is in G1 + if (!E1_in_G1(&elemsG1[0])) { + return INVALID; + } + + // elemsG1[1] = s2 + if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s2 is in G1 + if (!E1_in_G1(&elemsG1[1])) { + return INVALID; + } + + // elemsG2[1] = pk1 + E2_copy(&elemsG2[1], pk1); + + // elemsG2[0] = -pk2 + E2_neg(&elemsG2[0], pk2); + + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1 , elemsG2, 2); + + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; +} + diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 1ca61b376c4..2cbf91b2936 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -5,8 +5,7 @@ #include "bls12381_utils.h" - -// bls core (functions in bls_core.c) +// BLS signature core (functions in bls_core.c) int bls_sign(byte*, const Fr*, const byte*, const int); int bls_verify(const E2*, const byte*, const byte*, const int); int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, @@ -17,4 +16,7 @@ int bls_verifyPerDistinctKey(const byte*, void bls_batch_verify(const int, byte*, const E2*, const byte*, const byte*, const int, const byte*); +// BLS based SPoCK +int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); + #endif diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 6c99ae461e2..fdb21a986f1 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -5,12 +5,7 @@ import ( "errors" "fmt" - _ "errors" - - _ "fmt" - "github.com/onflow/flow-go/crypto/hash" - _ "github.com/onflow/flow-go/crypto/hash" ) // BLS multi-signature using BLS12-381 curve @@ -95,7 +90,6 @@ func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { // - (nil, error) if an unexpected error occurs // - (aggregated_signature, nil) otherwise func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - // check for empty list if len(sigs) == 0 { return nil, blsAggregateEmptyListError @@ -140,7 +134,6 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { // - (nil, blsAggregateEmptyListError) if no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 78a87823b4c..e951cc9c33f 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -22,7 
+22,7 @@ static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const byte indices[] // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. - #define MAX_IND_LOOPS 64/MAX_IND_BITS + #define MAX_IND_LOOPS (64/MAX_IND_BITS) const int loops = MAX_IND_LOOPS; int k,j = 0; Fr tmp; @@ -88,7 +88,7 @@ int E1_lagrange_interpolate_at_zero_write(byte* dest, const byte* shares, const E1* E1_shares = malloc(sizeof(E1) * len); for (int i=0; i < len; i++) { read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); - if (read_ret != BLST_SUCCESS) { + if (read_ret != VALID) { goto out; } } diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 20c2fcad5df..e3c0bb9701a 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -5,11 +5,6 @@ #include "point.h" #include "fields.h" #include "consts.h" -#include "errors.h" - -// TODO: add sanity checks that BLST_PK_IS_INFINITY is indeed the last -// enum value (eventually submit a fix to BLST) -#define BLST_BAD_SCALAR ((BLST_PK_IS_INFINITY)+1) // types used by the Flow crypto library that are imported from BLST // these type definitions are used as an abstraction from BLST internal types diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 9966fbcfc37..15e8e0c48b3 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -63,16 +63,16 @@ void G2_vector_write_bytes(byte* out, const E2* A, const int len) { // The function imports an array of E2 points from a concatenated array of bytes. // The bytes array is supposed to be in (len * G2_SER_BYTES) -BLST_ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ +ERROR E2_vector_read_bytes(E2* A, const byte* src, const int len){ byte* p = (byte*) src; for (int i=0; i Date: Tue, 30 May 2023 17:22:21 -0600 Subject: [PATCH 124/200] update code base to work for G1 serialization defined as uncompressed --- crypto/bls.go | 13 +++------- crypto/bls12381_utils.c | 40 +++++++++++------------------- crypto/bls12381_utils.go | 35 +++++++++++++++++++++++--- crypto/bls12381_utils.h | 15 +++++------ crypto/bls12381_utils_test.go | 27 ++++++++------------ crypto/bls_include.h | 4 +-- crypto/bls_multisig.go | 2 +- crypto/bls_test.go | 28 ++++++++++++++------- crypto/bls_thresholdsign_include.h | 4 +-- crypto/dkg_include.h | 4 +-- crypto/sign.go | 7 +++--- crypto/spock_test.go | 4 +-- 12 files changed, 99 insertions(+), 84 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 93dd487a817..7f884a73c49 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -207,11 +207,6 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) } } -// 0xC0 is the header of the point at infinity serialization (either in G1 or G2) -const infinityPointHeader = byte(0xC0) - -var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, SignatureLenBLSBLS12381-1)...) - // IsBLSSignatureIdentity checks whether the input signature is // the identity signature (point at infinity in G1). // @@ -221,7 +216,7 @@ var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, Sign // suspected to be equal to identity, which avoids failing the aggregated // signature verification. func IsBLSSignatureIdentity(s Signature) bool { - return bytes.Equal(s, identityBLSSignature) + return bytes.Equal(s, g1Serialization) } // generatePrivateKey deterministically generates a private key for BLS on BLS12-381 curve. 
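Note on the layout behind g1Serialization: this patch derives the identity encoding from the serialization mode instead of hard-coding the 0xC0 header. A minimal sketch of that layout, assuming only the zcash-style header bits already used in this series (bit 7 = compression, bit 6 = infinity); the helper name identitySerialization is illustrative and not part of the patch:

    // identitySerialization sketches how the point-at-infinity encoding is built:
    // one header byte (0xC0 when the group uses compressed serialization, 0x40
    // when uncompressed), followed by zero bytes up to the serialized point length.
    // serLen stands for g1BytesLen (G1) or g2BytesLen (G2).
    func identitySerialization(compressed bool, serLen int) []byte {
        header := byte(1 << 6) // infinity bit set (0x40)
        if compressed {
            header |= 1 << 7 // compression bit set as well (0xC0)
        }
        out := make([]byte, serLen)
        out[0] = header
        return out
    }

With the package defaults (G1 compressed), this reproduces the previous 0xC0-headed identity signature, so IsBLSSignatureIdentity keeps accepting the same byte string.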
@@ -347,8 +342,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err // decodePublicKeyCompressed decodes a slice of bytes into a public key. // since we use the compressed representation by default, this checks the default and delegates to decodePublicKeyCompressed func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (PublicKey, error) { - // in compression mode, g2BytesLen is equal to 2 * Fp_bytes - if g2BytesLen != 2*fpBytesLen { + if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } return a.decodePublicKey(publicKeyBytes) @@ -478,8 +472,7 @@ func (pk *pubKeyBLSBLS12381) Size() int { // The encoding is a compressed encoding of the point // [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { - // in compression mode, g2BytesLen is equal to 2 * Fp_bytes - if g2BytesLen != 2*fpBytesLen { + if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } dest := make([]byte, g2BytesLen) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 35bf1ff4686..30f8b862aa0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -543,6 +543,7 @@ ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (ret != VALID) { return ret; } + Fp_to_montg(&a->x, &a->x); // set a.z to 1 Fp_copy(&a->z, &BLS12_381_pR); @@ -552,6 +553,7 @@ ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { if (ret != VALID){ return ret; } + Fp_to_montg(&a->y, &a->y); // check read point is on curve if (!E1_affine_on_curve(a)) { return POINT_NOT_ON_CURVE; @@ -560,12 +562,12 @@ ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { } // compute the possible square root - Fp_to_montg(&a->x, &a->x); Fp_squ_montg(&a->y, &a->x); Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(&a->y, &a->y)) // check whether x^3+b is a quadratic residue + if (!Fp_sqrt_montg(&a->y, &a->y)) { // check whether x^3+b is a quadratic residue return POINT_NOT_ON_CURVE; + } // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) if (Fp_get_sign(&a->y) != y_sign) { @@ -718,30 +720,18 @@ void unsafe_map_bytes_to_G1(E1* p, const byte* bytes, int len) { G1_mult_gen(p, &log); } -// generates a point in E1\G1 and stores it in p +// maps bytes to a point in E1\G1. +// `len` must be at least 96 bytes. // this is a testing file only, should not be used in any protocol! 
-ERROR unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { - assert(G1_SERIALIZATION == COMPRESSED); - assert(len >= G1_SER_BYTES); - - // attempt to deserilize a compressed E1 point from input bytes - // after fixing the header 2 bits - byte copy[G1_SER_BYTES]; - memcpy(copy, bytes, sizeof(copy)); - copy[0] |= 1<<7; // set compression bit - copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - - ERROR ser = E1_read_bytes(p, copy, G1_SER_BYTES); - if (ser != VALID) { - return ser; - } - - // map the point to E2\G2 by clearing G2 order - E1_mult(p, p, (const Fr*)BLS12_381_r); - E1_to_affine(p, p); - - assert(E1_affine_on_curve(p)); // sanity check to make sure p is in E2 - return VALID; +void unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { + assert(len >= 96); + Fp u; + map_96_bytes_to_Fp(&u, bytes, 96); + // map to E1's isogenous and then to E1 + map_to_isogenous_E1((POINTonE1 *)p, u); + isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); + // clear G1 order + E1_mult(p, p, (Fr*)&BLS12_381_r); } // ------------------- E2 utilities diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 812319ced63..b9535d39955 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -60,6 +60,28 @@ const ( pointNotOnCurve = C.POINT_NOT_ON_CURVE ) +// header of the point at infinity serializations +var g1SerHeader byte // g1 +var g2SerHeader byte // g2 + +// `g1“ serialization +var g1Serialization []byte + +// initialization of BLS12-381 curve +func initBLS12381() { + if isG1Compressed() { + g1SerHeader = 0xC0 + } else { + g1SerHeader = 0x40 + } + g1Serialization = append([]byte{g1SerHeader}, make([]byte, g1BytesLen-1)...) + if isG2Compressed() { + g2SerHeader = 0xC0 + } else { + g2SerHeader = 0x40 + } +} + func (a *scalar) String() string { encoding := make([]byte, frBytesLen) writeScalar(encoding, a) @@ -260,9 +282,8 @@ func unsafeMapToG1(pt *pointE1, seed []byte) { // unsafeMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func unsafeMapToG1Complement(pt *pointE1, seed []byte) bool { - res := C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == valid +func unsafeMapToG1Complement(pt *pointE1, seed []byte) { + C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } // unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. 
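The simplified unsafeMapToG1Complement wrapper above no longer reports an error; the subgroup test later in this patch exercises it directly. A condensed usage sketch (only a sketch: it relies on the package-internal pointE1, unsafeMapToG1, unsafeMapToG1Complement and checkMembershipG1, and the caller supplies the random seed):

    // subgroupCheckSketch mirrors the updated TestSubgroupCheck: the same seed
    // is mapped once into G1 and once into its complement E1 \ G1, and only the
    // first point passes the G1 membership check.
    // seed must hold at least 96 random bytes (required by the C-side mapping).
    func subgroupCheckSketch(seed []byte) (inG1, inComplement bool) {
        var p pointE1
        unsafeMapToG1(&p, seed)               // point in G1
        inG1 = checkMembershipG1(&p)          // expected: true
        unsafeMapToG1Complement(&p, seed)     // point in E1 \ G1
        inComplement = !checkMembershipG1(&p) // expected: true
        return inG1, inComplement
    }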
@@ -307,3 +328,11 @@ func hashToG1Bytes(data, dst []byte) []byte { writePointE1(pointBytes, &point) return pointBytes } + +func isG1Compressed() bool { + return g1BytesLen == fpBytesLen +} + +func isG2Compressed() bool { + return g2BytesLen == 2*fpBytesLen +} diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 921df90624d..ccbb4c9655c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -2,8 +2,8 @@ // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -#ifndef _REL_MISC_INCLUDE_H -#define _REL_MISC_INCLUDE_H +#ifndef _BLS12_381_UTILS_H +#define _BLS12_381_UTILS_H #include #include "blst_include.h" @@ -43,8 +43,8 @@ typedef enum { #define G2_BYTES (2*Fp2_BYTES) // Compressed and uncompressed points -#define COMPRESSED 1 -#define UNCOMPRESSED 0 +#define COMPRESSED 1 +#define UNCOMPRESSED 0 #define G1_SERIALIZATION (COMPRESSED) #define G2_SERIALIZATION (COMPRESSED) #define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) @@ -94,7 +94,7 @@ void G1_mult_gen(E1*, const Fr*); ERROR E1_read_bytes(E1*, const byte *, const int); void E1_write_bytes(byte *, const E1*); void unsafe_map_bytes_to_G1(E1*, const byte*, int); -ERROR unsafe_map_bytes_to_G1complement(E1*, const byte*, int); +void unsafe_map_bytes_to_G1complement(E1*, const byte*, int); #define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) int map_to_G1(E1*, const byte*, const int); // functions in bls12381_hashtocurve.c @@ -130,6 +130,7 @@ void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions #define DEBUG 0 #if (DEBUG == 1) +#include void bytes_print_(char*, byte*, int); void Fr_print_(char*, Fr*); void Fp_print_(char*, const Fp*); @@ -137,6 +138,6 @@ void Fp2_print_(char*, const Fp2*); void Fp12_print_(char*, const Fp12*); void E1_print_(char*, const E1*, const int); void E2_print_(char*, const E2*, const int); -#endif // DEBUG +#endif /* DEBUG */ -#endif \ No newline at end of file +#endif /* BLS12_381_UTILS */ \ No newline at end of file diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 2c9d76bbbe5..7741238278e 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -139,12 +139,7 @@ func TestSubgroupCheck(t *testing.T) { unsafeMapToG1(&p, seed) // point in G1 assert.True(t, checkMembershipG1(&p)) - inG1 := false - for !inG1 { - _, err := prg.Read(seed) - require.NoError(t, err) - inG1 = unsafeMapToG1Complement(&p, seed) // point in E2\G2 - } + unsafeMapToG1Complement(&p, seed) // point in E2\G2 assert.False(t, checkMembershipG1(&p)) }) @@ -198,8 +193,8 @@ func TestReadWriteG1(t *testing.T) { bytes := make([]byte, g1BytesLen) // generate a random G1 point, encode it, decode it, // and compare it the original point - iterations := 50 t.Run("random points", func(t *testing.T) { + iterations := 50 for i := 0; i < iterations; i++ { var p, q pointE1 _, err := prg.Read(seed) @@ -213,16 +208,14 @@ func TestReadWriteG1(t *testing.T) { }) t.Run("infinity", func(t *testing.T) { - for i := 0; i < iterations; i++ { - var p, q pointE1 - seed := make([]byte, frBytesLen) - unsafeMapToG1(&p, seed) // this results in the infinity point - writePointE1(bytes, &p) - require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check - err := readPointE1(&q, bytes) - require.NoError(t, err) - assert.True(t, p.equals(&q)) - } + var p, q pointE1 + seed := make([]byte, frBytesLen) + unsafeMapToG1(&p, seed) // this results in the infinity point + 
writePointE1(bytes, &p) + require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check + err := readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) }) } diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 2cbf91b2936..c5dba4d45de 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -1,7 +1,7 @@ // this file is about the core functions required by the BLS signature scheme -#ifndef _REL_BLS_INCLUDE_H -#define _REL_BLS_INCLUDE_H +#ifndef _BLS_INCLUDE_H +#define _BLS_INCLUDE_H #include "bls12381_utils.h" diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index fdb21a986f1..7adbb0c1f45 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -507,7 +507,7 @@ func BatchVerifyBLSSignaturesOneMessage( // However, the boolean return for index `i` is set to `false` and won't be overwritten. returnBool[i] = false pkPoints = append(pkPoints, getIdentityPoint()) - flatSigs = append(flatSigs, identityBLSSignature...) + flatSigs = append(flatSigs, g1Serialization...) } else { pkPoints = append(pkPoints, pkBLS.point) flatSigs = append(flatSigs, sigs[i]...) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index d8561ccc5f6..7ea369a5b73 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -29,6 +29,9 @@ func TestBLSMainMethods(t *testing.T) { // - signature decoding only accepts reduced x-coordinates to avoid signature malleability t.Run("invalid x coordinate larger than p", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7") require.NoError(t, err) validSig, err := hex.DecodeString("80b0cac2a0f4f8881913edf2b29065675dfed6f6f4e17e9b5d860a845d4e7d476b277d06a493b81482e63d8131f9f2fa") @@ -190,7 +193,7 @@ func TestBLSEncodeDecode(t *testing.T) { t.Run("infinity public key", func(t *testing.T) { // decode an identity public key pkBytes := make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = infinityPointHeader + pkBytes[0] = g2SerHeader pk, err := DecodePublicKey(BLSBLS12381, pkBytes) require.NoError(t, err, "decoding identity public key should succeed") assert.True(t, pk.Equals(IdentityBLSPublicKey())) @@ -543,12 +546,15 @@ func TestBLSAggregatePublicKeys(t *testing.T) { assert.True(t, blsKey.isIdentity) // check of encoding header pkBytes := aggPK.Encode() - assert.Equal(t, infinityPointHeader, pkBytes[0]) + assert.Equal(t, g2SerHeader, pkBytes[0]) }) t.Run("Identity public key from opposite points", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } pkBytes := pks[0].Encode() - negatePoint(pkBytes) + negateCompressedPoint(pkBytes) minusPk, err := DecodePublicKey(BLSBLS12381, pkBytes) require.NoError(t, err) // aggregated public keys @@ -561,7 +567,7 @@ func TestBLSAggregatePublicKeys(t *testing.T) { assert.True(t, blsKey.isIdentity) // check of encoding header pkBytes = aggPK.Encode() - assert.Equal(t, infinityPointHeader, pkBytes[0]) + assert.Equal(t, g2SerHeader, pkBytes[0]) }) } @@ -822,9 +828,9 @@ func TestBLSBatchVerify(t *testing.T) { } // Utility function that flips a point sign bit to negate the point -// this is shortcut which works only for zcash BLS12-381 compressed serialization -// Applicable to both signatures and public keys -func negatePoint(pointbytes []byte) { +// this is shortcut which works only for zcash BLS12-381 compressed serialization. 
+// Applicable to both signatures and public keys. +func negateCompressedPoint(pointbytes []byte) { pointbytes[0] ^= 0x20 } @@ -1190,10 +1196,14 @@ func TestBLSIdentity(t *testing.T) { hasher := NewExpandMsgXOFKMAC128("") t.Run("identity signature comparison", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } // verify that constructed identity signatures are recognized as such by IsBLSSignatureIdentity. // construct identity signature by summing (aggregating) a random signature and its inverse. - assert.True(t, IsBLSSignatureIdentity(identityBLSSignature)) + // sanity check to start + assert.True(t, IsBLSSignatureIdentity(g1Serialization)) // sum up a random signature and its inverse to get identity sk := randomSK(t, rand) @@ -1201,7 +1211,7 @@ func TestBLSIdentity(t *testing.T) { require.NoError(t, err) oppositeSig := make([]byte, SignatureLenBLSBLS12381) copy(oppositeSig, sig) - negatePoint(oppositeSig) + negateCompressedPoint(oppositeSig) aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig}) require.NoError(t, err) assert.True(t, IsBLSSignatureIdentity(aggSig)) diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h index 3937f8ce965..7c27c3b97b8 100644 --- a/crypto/bls_thresholdsign_include.h +++ b/crypto/bls_thresholdsign_include.h @@ -1,5 +1,5 @@ -#ifndef _REL_THRESHOLD_INCLUDE_H -#define _REL_THRESHOLD_INCLUDE_H +#ifndef _THRESHOLD_INCLUDE_H +#define _THRESHOLD_INCLUDE_H #include "bls_include.h" diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index c361d3ce861..7cd2b8b7d2d 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -1,5 +1,5 @@ -#ifndef _REL_DKG_INCLUDE_H -#define _REL_DKG_INCLUDE_H +#ifndef _DKG_INCLUDE_H +#define _DKG_INCLUDE_H #include "bls12381_utils.h" diff --git a/crypto/sign.go b/crypto/sign.go index ff4348f3b09..d400898d97d 100644 --- a/crypto/sign.go +++ b/crypto/sign.go @@ -65,19 +65,18 @@ func newSigner(algo SigningAlgorithm) (signer, error) { // Initialize the context of all algos func init() { - // P-256 + // ECDSA p256Instance = &(ecdsaAlgo{ curve: elliptic.P256(), algo: ECDSAP256, }) - - // secp256k1 secp256k1Instance = &(ecdsaAlgo{ curve: btcec.S256(), algo: ECDSASecp256k1, }) - // bls12-381 + // BLS + initBLS12381() blsInstance = &blsBLS12381Algo{ algo: BLSBLS12381, } diff --git a/crypto/spock_test.go b/crypto/spock_test.go index 75de3dea838..59498a42f6f 100644 --- a/crypto/spock_test.go +++ b/crypto/spock_test.go @@ -69,7 +69,7 @@ func TestSPOCKProveVerifyAgainstData(t *testing.T) { t.Run("identity proof", func(t *testing.T) { // verifying with a pair of (proof, publicKey) equal to (identity_signature, identity_key) should // return false - identityProof := identityBLSSignature + identityProof := g1Serialization result, err := SPOCKVerifyAgainstData(IdentityBLSPublicKey(), identityProof, data, kmac) assert.NoError(t, err) assert.False(t, result) @@ -166,7 +166,7 @@ func TestSPOCKProveVerify(t *testing.T) { t.Run("identity proof", func(t *testing.T) { // verifying with either pair of (proof, publicKey) equal to (identity_signature, identity_key) should // return falsen with any other (proof, key) pair. 
- identityProof := identityBLSSignature + identityProof := g1Serialization result, err := SPOCKVerify(IdentityBLSPublicKey(), identityProof, sk2.PublicKey(), pr2) assert.NoError(t, err) assert.False(t, result) From 9ed47f1975689db2a296c6c22da0e446d7a0158c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 18:23:51 -0600 Subject: [PATCH 125/200] update code base to work for G2 serialization defined as uncompressed --- crypto/bls.go | 16 +++++----- crypto/bls12381_utils.c | 55 +++++++++++++---------------------- crypto/bls12381_utils.go | 5 ++-- crypto/bls12381_utils.h | 2 +- crypto/bls12381_utils_test.go | 18 +++++++----- crypto/bls_test.go | 5 +++- crypto/sign_test_utils.go | 26 +++++++++-------- 7 files changed, 62 insertions(+), 65 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index 7f884a73c49..447a203033b 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -286,7 +286,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { const invalidBLSSignatureHeader = byte(0xE0) // BLSInvalidSignature returns an invalid signature that fails when verified -// with any message and public key. +// with any message and public key, which can be used for testing. // // The signature bytes represent an invalid serialization of a point which // makes the verification fail early. The verification would return (false, nil). @@ -475,15 +475,17 @@ func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } - dest := make([]byte, g2BytesLen) - writePointE2(dest, &a.point) - return dest + return a.Encode() } -// Encode returns a byte encoding of the public key. -// Since we use a compressed encoding by default, this delegates to EncodeCompressed +// Encode returns a byte encoding of the public key (a G2 point). +// The current encoding is a compressed serialization of G2 following [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- +// +// The function should evolve in the future to support uncompressed compresion too. 
func (a *pubKeyBLSBLS12381) Encode() []byte { - return a.EncodeCompressed() + dest := make([]byte, g2BytesLen) + writePointE2(dest, &a.point) + return dest } // Equals checks is two public keys are equal diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 30f8b862aa0..d88bfa3aaa8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -792,17 +792,23 @@ ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { if (ret != VALID) { return ret; } + Fp2* a_x = &(a->x); + Fp_to_montg(&real(a_x), &real(a_x)); + Fp_to_montg(&imag(a_x), &imag(a_x)); // set a.z to 1 Fp2* a_z = &(a->z); Fp_copy(&real(a_z), &BLS12_381_pR); Fp_set_zero(&imag(a_z)); + Fp2* a_y = &(a->y); if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = Fp2_read_bytes(&(a->y), bin + Fp2_BYTES, sizeof(a->y)); + ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); if (ret != VALID){ return ret; } + Fp_to_montg(&real(a_y), &real(a_y)); + Fp_to_montg(&imag(a_y), &imag(a_y)); // check read point is on curve if (!E2_affine_on_curve(a)) { return POINT_NOT_ON_CURVE; @@ -811,14 +817,9 @@ ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { } // compute the possible square root - Fp2* a_x = &(a->x); - Fp_to_montg(&real(a_x), &real(a_x)); - Fp_to_montg(&imag(a_x), &imag(a_x)); - - Fp2* a_y = &(a->y); Fp2_squ_montg(a_y, a_x); - Fp2_mul_montg(a_y, a_y, a_x); - Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form + Fp2_mul_montg(a_y, a_y, a_x); // x^3 + Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue return POINT_NOT_ON_CURVE; @@ -976,33 +977,19 @@ void unsafe_map_bytes_to_G2(E2* p, const byte* bytes, int len) { G2_mult_gen(p, &log); } -// attempts to map `bytes` to a point in E2\G2 and stores it in p. -// `len` should be at least G2_SER_BYTES. It returns VALID only if mapping -// succeeds. -// For now, function only works when E2 serialization is compressed. +// maps `bytes` to a point in E2\G2 and stores it in p. +// `len` should be at least 192. // this is a testing tool only, it should not be used in any protocol! 
-ERROR unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { - assert(G2_SERIALIZATION == COMPRESSED); - assert(len >= G2_SER_BYTES); - - // attempt to deserilize a compressed E2 point from input bytes - // after fixing the header 2 bits - byte copy[G2_SER_BYTES]; - memcpy(copy, bytes, sizeof(copy)); - copy[0] |= 1<<7; // set compression bit - copy[0] &= ~(1<<6); // clear infinity bit - point is not infinity - - ERROR ser = E2_read_bytes(p, copy, G2_SER_BYTES); - if (ser != VALID) { - return ser; - } - - // map the point to E2\G2 by clearing G2 order - E2_mult(p, p, (const Fr*)BLS12_381_r); - E2_to_affine(p, p); - - assert(E2_affine_on_curve(p)); // sanity check to make sure p is in E2 - return VALID; +void unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { + assert(len >= 192); + Fp2 u; + map_96_bytes_to_Fp(&real(&u), bytes, 96); + map_96_bytes_to_Fp(&imag(&u), bytes+96, 96); + // map to E2's isogenous and then to E2 + map_to_isogenous_E2((POINTonE2 *)p, u); + isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); + // clear G2 order + E2_mult(p, p, (Fr*)&BLS12_381_r); } // ------------------- Pairing utilities diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index b9535d39955..87a515f3b31 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -295,9 +295,8 @@ func unsafeMapToG2(pt *pointE2, seed []byte) { // unsafeMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func unsafeMapToG2Complement(pt *pointE2, seed []byte) bool { - res := C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) - return int(res) == valid +func unsafeMapToG2Complement(pt *pointE2, seed []byte) { + C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } // This is only a TEST function. 
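Both of these serialization patches branch on the encoded point length via isG1Compressed / isG2Compressed (added in the previous patch). For reference, a sketch of the length arithmetic those helpers rely on, assuming Fp serializes to 48 bytes on BLS12-381; serializedLen is illustrative only and not part of the patch:

    // serializedLen mirrors G1_SER_BYTES / G2_SER_BYTES from bls12381_utils.h:
    // an affine point carries one (G1) or two (G2) Fp elements per coordinate,
    // and compression drops the y coordinate, halving the length.
    func serializedLen(fpPerCoordinate int, compressed bool) int {
        const fpBytes = 48                    // assumed Fp byte length for BLS12-381
        full := 2 * fpPerCoordinate * fpBytes // x and y coordinates
        if compressed {
            return full / 2 // y is recomputed from x when decoding
        }
        return full
    }

    // serializedLen(1, true)  == 48  : compressed G1 (signature length)
    // serializedLen(1, false) == 96  : uncompressed G1
    // serializedLen(2, true)  == 96  : compressed G2 (public key length)
    // serializedLen(2, false) == 192 : uncompressed G2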
diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index ccbb4c9655c..b9c8ab755a7 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -117,7 +117,7 @@ void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); bool E2_in_G2(const E2*); void unsafe_map_bytes_to_G2(E2*, const byte*, int); -ERROR unsafe_map_bytes_to_G2complement(E2*, const byte*, int); +void unsafe_map_bytes_to_G2complement(E2*, const byte*, int); // pairing and Fp12 bool Fp12_is_one(Fp12*); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 7741238278e..067ac979f7e 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -22,6 +22,9 @@ func TestScalarMultBLS12381(t *testing.T) { // Note that generator and random point multiplications // are implemented with the same algorithm t.Run("G1", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } var p pointE1 generatorScalarMultG1(&p, &expo) expected, err := hex.DecodeString("96484ca50719f5d2533047960878b6bae8289646c0f00a942a1e6992be9981a9e0c7a51e9918f9b19d178cf04a8018a4") @@ -35,6 +38,9 @@ func TestScalarMultBLS12381(t *testing.T) { // Note that generator and random point multiplications // are implemented with the same algorithm t.Run("G2", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } var p pointE2 generatorScalarMultG2(&p, &expo) expected, err := hex.DecodeString("b35f5043f166848805b98da62dcb9c5d2f25e497bd0d9c461d4a00d19e4e67cc1e813de3c99479d5a2c62fb754fd7df40c4fd60c46834c8ae665343a3ff7dc3cc929de34ad62b7b55974f4e3fd20990d3e564b96e4d33de87716052d58cf823e") @@ -81,6 +87,9 @@ func BenchmarkScalarMult(b *testing.B) { // Sanity-check of the map-to-G1 with regards to the IETF draft hash-to-curve func TestMapToG1(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } // test vectors from https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#appendix-J.9.1 dst := []byte("QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_") @@ -130,7 +139,7 @@ func BenchmarkMapToG1(b *testing.B) { // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { prg := getPRG(t) - seed := make([]byte, g2BytesLen) + seed := make([]byte, 192) _, err := prg.Read(seed) require.NoError(t, err) @@ -148,12 +157,7 @@ func TestSubgroupCheck(t *testing.T) { unsafeMapToG2(&p, seed) // point in G2 assert.True(t, checkMembershipG2(&p)) - inG2 := false - for !inG2 { - _, err := prg.Read(seed) - require.NoError(t, err) - inG2 = unsafeMapToG2Complement(&p, seed) // point in E2\G2 - } + unsafeMapToG2Complement(&p, seed) // point in E2\G2 assert.False(t, checkMembershipG2(&p)) }) } diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 7ea369a5b73..4047967be9b 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -29,7 +29,7 @@ func TestBLSMainMethods(t *testing.T) { // - signature decoding only accepts reduced x-coordinates to avoid signature malleability t.Run("invalid x coordinate larger than p", func(t *testing.T) { - if !isG1Compressed() { + if !isG1Compressed() || !isG2Compressed() { t.Skip() } msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7") @@ -221,6 +221,9 @@ func TestBLSEncodeDecode(t *testing.T) { // may implicitely rely on the property. 
t.Run("public key with non-reduced coordinates", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } // valid pk with x[0] < p and x[1] < p validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") require.NoError(t, err) diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8362df83c7f..06179a01989 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -5,7 +5,6 @@ import ( "fmt" mrand "math/rand" "testing" - "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -14,7 +13,7 @@ import ( ) func getPRG(t *testing.T) *mrand.Rand { - random := time.Now().UnixNano() + random := int64(1685491239186156000) //time.Now().UnixNano() t.Logf("rng seed is %d", random) rng := mrand.New(mrand.NewSource(random)) return rng @@ -186,13 +185,13 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { skCheckBytes := skCheck.Encode() assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") distinctSkBytes := distinctSk.Encode() - assert.NotEqual(t, skBytes, distinctSkBytes, "keys should be different") + assert.NotEqual(t, skBytes, distinctSkBytes) // check public key encoding pk := sk.PublicKey() pkBytes := pk.Encode() pkCheck, err := DecodePublicKey(salg, pkBytes) - require.Nil(t, err, "the key decoding failed") + require.Nil(t, err) assert.True(t, pk.Equals(pkCheck), "key equality check failed") pkCheckBytes := pkCheck.Encode() assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") @@ -200,14 +199,17 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") // same for the compressed encoding - pkComprBytes := pk.EncodeCompressed() - pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") - pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") - distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + // skip is BLS is used and compression isn't supported + if !(salg == BLSBLS12381 && !isG2Compressed()) { + pkComprBytes := pk.EncodeCompressed() + pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) + require.Nil(t, err, "the key decoding failed") + assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") + pkCheckComprBytes := pkComprCheck.EncodeCompressed() + assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") + distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() + assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + } } }) From 01b64c560343ef605c7d4737c802c2b7ac2b2ac7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 21:34:18 -0600 Subject: [PATCH 126/200] make sure older compilers recognize uintx_t --- crypto/bls12381_utils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b9c8ab755a7..d35e0298c59 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -6,6 +6,7 @@ #define _BLS12_381_UTILS_H #include +#include #include "blst_include.h" typedef uint8_t byte; From 
7b0a25ec8190b67202b9a4180a13363350e31937 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 21:40:31 -0600 Subject: [PATCH 127/200] update crypto/Makefile go command --- crypto/Makefile | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index a75e00df15b..3fa010ca6ae 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -10,20 +10,24 @@ else RACE_FLAG := endif +# the crypto package uses BLST source files underneath which may use ADX insructions ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif # test all packages .PHONY: test test: -# root package (it uses BLST source files underneath which requires testing for ADX support) -ifeq ($(ADX_SUPPORT), 1) - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) -else - CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) -endif +# root package + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) # sub packages - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random .PHONY: docker-build docker-build: From 00f66d6c2f17ab095358f1420c9b8c5b089902c9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 30 May 2023 21:43:08 -0600 Subject: [PATCH 128/200] package default build uses ADX --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 87a515f3b31..a01c46e05b8 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,7 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -O -D__BLST_PORTABLE__ -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" From e998ab6eebc5630035c476d1d202f291da864ec6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 17:08:47 
-0600 Subject: [PATCH 129/200] add ADX detection and cgo flags for all go commands in Makefile/Dockerfile --- Makefile | 82 +++++++++++++-------- cmd/Dockerfile | 6 +- crypto/Makefile | 12 ++- insecure/Makefile | 20 ++++- integration/Makefile | 42 ++++++++--- integration/benchmark/cmd/manual/Dockerfile | 5 +- 6 files changed, 117 insertions(+), 50 deletions(-) diff --git a/Makefile b/Makefile index cd402f40f1e..8fcb8fa3ecb 100644 --- a/Makefile +++ b/Makefile @@ -42,11 +42,29 @@ K8S_YAMLS_LOCATION_STAGING=./k8s/staging export CONTAINER_REGISTRY := gcr.io/flow-container-registry export DOCKER_BUILDKIT := 1 +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif + cmd/collection/collection: - go build -o cmd/collection/collection cmd/collection/main.go + $(CGO_FLAG) go build -o cmd/collection/collection cmd/collection/main.go cmd/util/util: - go build -o cmd/util/util cmd/util/main.go + $(CGO_FLAG) go build -o cmd/util/util cmd/util/main.go .PHONY: update-core-contracts-version update-core-contracts-version: @@ -58,7 +76,7 @@ update-core-contracts-version: .PHONY: unittest-main unittest-main: # test all packages - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(GO_TEST_PACKAGES) + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(GO_TEST_PACKAGES) .PHONY: install-mock-generators install-mock-generators: @@ -83,7 +101,7 @@ verify-mocks: generate-mocks .PHONY: fuzz-fvm fuzz-fvm: # run fuzz tests in the fvm package - cd ./fvm && go test -fuzz=Fuzz -run ^$$ + cd ./fvm && $(CGO_FLAG) go test -fuzz=Fuzz -run ^$$ .PHONY: test test: verify-mocks unittest-main @@ -121,7 +139,7 @@ generate-proto: .PHONY: generate-fvm-env-wrappers generate-fvm-env-wrappers: - go run ./fvm/environment/generate-wrappers fvm/environment/parse_restricted_checker.go + $(CGO_FLAG) go run ./fvm/environment/generate-wrappers fvm/environment/parse_restricted_checker.go .PHONY: generate-mocks generate-mocks: install-mock-generators @@ -241,59 +259,59 @@ docker-ci-integration: .PHONY: docker-build-collection docker-build-collection: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t 
"$(CONTAINER_REGISTRY)/collection:latest" -t "$(CONTAINER_REGISTRY)/collection:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/collection:$(FLOW_GO_TAG)" . .PHONY: docker-build-collection-without-netgo docker-build-collection-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-collection-debug docker-build-collection-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/collection-debug:latest" -t "$(CONTAINER_REGISTRY)/collection-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection-debug:$(IMAGE_TAG)" . .PHONY: docker-build-consensus docker-build-consensus: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/consensus:latest" -t "$(CONTAINER_REGISTRY)/consensus:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/consensus:$(FLOW_GO_TAG)" . .PHONY: docker-build-consensus-without-netgo docker-build-consensus-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG_NO_NETGO)" . 
.PHONY: docker-build-consensus-debug docker-build-consensus-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/consensus-debug:latest" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(IMAGE_TAG)" . .PHONY: docker-build-execution docker-build-execution: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution:latest" -t "$(CONTAINER_REGISTRY)/execution:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/execution:$(FLOW_GO_TAG)" . .PHONY: docker-build-execution-without-netgo docker-build-execution-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-execution-debug docker-build-execution-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/execution-debug:latest" -t "$(CONTAINER_REGISTRY)/execution-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-debug:$(IMAGE_TAG)" . 
# build corrupt execution node for BFT testing @@ -301,28 +319,28 @@ docker-build-execution-debug: docker-build-execution-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution-corrupted:latest" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-verification docker-build-verification: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification:latest" -t "$(CONTAINER_REGISTRY)/verification:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/verification:$(FLOW_GO_TAG)" . .PHONY: docker-build-verification-without-netgo docker-build-verification-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-verification-debug docker-build-verification-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/verification-debug:latest" -t "$(CONTAINER_REGISTRY)/verification-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-debug:$(IMAGE_TAG)" . 
# build corrupt verification node for BFT testing @@ -330,28 +348,28 @@ docker-build-verification-debug: docker-build-verification-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification-corrupted:latest" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-access docker-build-access: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access:latest" -t "$(CONTAINER_REGISTRY)/access:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/access:$(FLOW_GO_TAG)" . .PHONY: docker-build-access-without-netgo docker-build-access-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-access-debug docker-build-access-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/access-debug:latest" -t "$(CONTAINER_REGISTRY)/access-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-debug:$(IMAGE_TAG)" . 
# build corrupt access node for BFT testing @@ -359,21 +377,21 @@ docker-build-access-debug: docker-build-access-corrupt: #temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access-corrupted:latest" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-observer docker-build-observer: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/observer:latest" -t "$(CONTAINER_REGISTRY)/observer:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG)" . .PHONY: docker-build-observer-without-netgo docker-build-observer-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG_NO_NETGO)" . @@ -381,18 +399,18 @@ docker-build-observer-without-netgo: .PHONY: docker-build-ghost docker-build-ghost: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/ghost:latest" -t "$(CONTAINER_REGISTRY)/ghost:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost:$(IMAGE_TAG)" . 
.PHONY: docker-build-ghost-debug docker-build-ghost-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/ghost-debug:latest" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(IMAGE_TAG)" . PHONY: docker-build-bootstrap docker-build-bootstrap: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/bootstrap:latest" -t "$(CONTAINER_REGISTRY)/bootstrap:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap:$(IMAGE_TAG)" . @@ -402,7 +420,7 @@ tool-bootstrap: docker-build-bootstrap .PHONY: docker-build-bootstrap-transit docker-build-bootstrap-transit: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --no-cache \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --no-cache \ --target production \ -t "$(CONTAINER_REGISTRY)/bootstrap-transit:latest" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(IMAGE_TAG)" . @@ -412,7 +430,7 @@ tool-transit: docker-build-bootstrap-transit .PHONY: docker-build-loader docker-build-loader: - docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --target production \ + docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/loader:latest" -t "$(CONTAINER_REGISTRY)/loader:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/loader:$(IMAGE_TAG)" . @@ -597,7 +615,7 @@ docker-all-tools: tool-util tool-remove-execution-fork PHONY: docker-build-util docker-build-util: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/util:latest" -t "$(CONTAINER_REGISTRY)/util:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/util:$(IMAGE_TAG)" . 
PHONY: tool-util @@ -606,7 +624,7 @@ tool-util: docker-build-util PHONY: docker-build-remove-execution-fork docker-build-remove-execution-fork: - docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/remove-execution-fork:latest" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(IMAGE_TAG)" . PHONY: tool-remove-execution-fork diff --git a/cmd/Dockerfile b/cmd/Dockerfile index 90075485922..d3660bd2b27 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -36,13 +36,15 @@ ARG GOARCH=amd64 # TAGS can be overriden to modify the go build tags (e.g. build without netgo) ARG TAGS="netgo" +# CGO_FLAG can be overwritten +ARG CGO_FLAG # Keep Go's build cache between builds. # https://github.com/golang/go/issues/27719#issuecomment-514747274 RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=secret,id=git_creds,dst=/root/.netrc \ - CGO_ENABLED=1 GOOS=linux go build --tags "${TAGS}" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "${TAGS}" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -o ./app ${TARGET} @@ -63,7 +65,7 @@ ARG GOARCH=amd64 RUN --mount=type=ssh \ --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=1 GOOS=linux go build --tags "netgo" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "netgo" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -gcflags="all=-N -l" -o ./app ${TARGET} diff --git a/crypto/Makefile b/crypto/Makefile index 3fa010ca6ae..c7361bde76b 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -10,8 +10,16 @@ else RACE_FLAG := endif -# the crypto package uses BLST source files underneath which may use ADX insructions -ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation CGO_FLAG := diff --git a/insecure/Makefile b/insecure/Makefile index 9872f01b1d8..635d9a06ad7 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -8,7 +8,25 @@ else RACE_FLAG := endif +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. 
+ ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif + # runs all unit tests of the insecure module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./... diff --git a/integration/Makefile b/integration/Makefile index 7751b4ee333..b01c10d1954 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -8,6 +8,24 @@ else RACE_FLAG := endif +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CGO_FLAG := +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" +endif + # Run the integration test suite .PHONY: integration-test integration-test: access-tests ghost-tests mvp-tests execution-tests verification-tests upgrades-tests collection-tests epochs-tests network-tests consensus-tests @@ -22,53 +40,53 @@ ci-integration-test: access-tests ghost-tests mvp-tests epochs-tests consensus-t # Run unit tests for test utilities in this module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` .PHONY: access-tests access-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/... .PHONY: collection-tests collection-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/collection/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/collection/... .PHONY: consensus-tests consensus-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/consensus/... 
+ $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/consensus/... .PHONY: epochs-tests epochs-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... .PHONY: ghost-tests ghost-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/ghost/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/ghost/... .PHONY: mvp-tests mvp-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/mvp/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/mvp/... .PHONY: execution-tests execution-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/execution/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/execution/... .PHONY: verification-tests verification-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/verification/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/verification/... .PHONY: upgrades-tests upgrades-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... .PHONY: network-tests network-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... # BFT tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: bft-tests bft-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 ############################################################################################ diff --git a/integration/benchmark/cmd/manual/Dockerfile b/integration/benchmark/cmd/manual/Dockerfile index 8d474efd3dc..8ae85e43326 100644 --- a/integration/benchmark/cmd/manual/Dockerfile +++ b/integration/benchmark/cmd/manual/Dockerfile @@ -29,6 +29,9 @@ COPY . . FROM build-env as build-production WORKDIR /app +# CGO_FLAG can be overwritten +ARG CGO_FLAG + # Keep Go's build cache between builds. 
# https://github.com/golang/go/issues/27719#issuecomment-514747274 # Also, allow ssh access @@ -36,7 +39,7 @@ RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=ssh \ cd integration && \ - CGO_ENABLED=1 go build -ldflags "-extldflags -static" -o ./app ./${TARGET} + CGO_ENABLED=1 ${CGO_FLAG} go build -ldflags "-extldflags -static" -o ./app ./${TARGET} RUN mv /app/integration/app /app/app From 5dba1c5bfd924a8188864e1fba1101a7645169c1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 19:25:00 -0600 Subject: [PATCH 130/200] clarify BLST sigill message --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a01c46e05b8..72e4c010e11 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,7 +14,7 @@ package crypto // # include // # include // static void handler(int signum) -// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=-O -D__BLST_PORTABLE__"; +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with "CGO_CFLAGS=-O -D__BLST_PORTABLE__"\n"; // ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; From f471e4b9aabe3d9f0d8597b8e3818046481012e0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 20:21:57 -0600 Subject: [PATCH 131/200] fix a bug in sigill string --- crypto/bls12381_utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 72e4c010e11..fa2a6ff65de 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,7 +14,7 @@ package crypto // # include // # include // static void handler(int signum) -// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with "CGO_CFLAGS=-O -D__BLST_PORTABLE__"\n"; +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with \"CGO_CFLAGS=-O -D__BLST_PORTABLE__\"\n"; // ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; From d4b873db9e74bf66754bd3a147eb3d5b2b4b1a60 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 31 May 2023 20:23:09 -0600 Subject: [PATCH 132/200] update how cgo flag is passed to Dockerfile so that dittos aren't deleted --- Makefile | 59 +++++++++++---------- cmd/Dockerfile | 4 +- crypto/Makefile | 5 +- insecure/Makefile | 5 +- integration/Makefile | 5 +- integration/benchmark/cmd/manual/Dockerfile | 2 +- 6 files changed, 42 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 8fcb8fa3ecb..93fa3be60ba 100644 --- a/Makefile +++ b/Makefile @@ -54,11 +54,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. 
ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) cmd/collection/collection: $(CGO_FLAG) go build -o cmd/collection/collection cmd/collection/main.go @@ -259,59 +260,59 @@ docker-ci-integration: .PHONY: docker-build-collection docker-build-collection: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/collection:latest" -t "$(CONTAINER_REGISTRY)/collection:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/collection:$(FLOW_GO_TAG)" . .PHONY: docker-build-collection-without-netgo docker-build-collection-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-collection-debug docker-build-collection-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/collection-debug:latest" -t "$(CONTAINER_REGISTRY)/collection-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection-debug:$(IMAGE_TAG)" . 
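For context on what the flag selection above amounts to: the Makefiles trust /proc/cpuinfo only on Linux and assume ADX support everywhere else, then choose between empty CGO flags (fast ADX BLST path) and "-O -D__BLST_PORTABLE__" (slower portable path). A minimal Go sketch of an equivalent runtime check follows; it assumes the golang.org/x/sys/cpu package, which is not used by this patch and is shown purely for illustration.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// An empty flag set keeps the fast ADX-based BLST implementation;
	// the portable define falls back to the generic implementation.
	cryptoFlags := ""
	if !cpu.X86.HasADX {
		cryptoFlags = "-O -D__BLST_PORTABLE__"
	}
	fmt.Printf("suggested CGO_CFLAGS: %q\n", cryptoFlags)
}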
.PHONY: docker-build-consensus docker-build-consensus: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/consensus:latest" -t "$(CONTAINER_REGISTRY)/consensus:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/consensus:$(FLOW_GO_TAG)" . .PHONY: docker-build-consensus-without-netgo docker-build-consensus-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-consensus-debug docker-build-consensus-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/consensus-debug:latest" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(IMAGE_TAG)" . .PHONY: docker-build-execution docker-build-execution: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution:latest" -t "$(CONTAINER_REGISTRY)/execution:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/execution:$(FLOW_GO_TAG)" . 
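The COMMIT and VERSION build-args in the recipes above are not read by the binaries directly; the Dockerfile burns them in through -ldflags "-X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' ...". The -X linker flag can only overwrite package-level string variables, roughly of the following shape. This is a hedged sketch: the variable names match the -X targets shown in the Dockerfile, but the accessor functions and the rest of flow-go's cmd/build package may differ.

package build

// These values are overwritten at link time, e.g.:
//   go build -ldflags "-X 'github.com/onflow/flow-go/cmd/build.commit=<sha>'"
// A plain `go build` without the flag leaves them empty.
var (
	commit string
	semver string
)

// Commit returns the git commit injected at build time (empty for local builds).
func Commit() string { return commit }

// Semver returns the version string injected at build time.
func Semver() string { return semver }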
.PHONY: docker-build-execution-without-netgo docker-build-execution-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-execution-debug docker-build-execution-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/execution-debug:latest" -t "$(CONTAINER_REGISTRY)/execution-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-debug:$(IMAGE_TAG)" . # build corrupt execution node for BFT testing @@ -319,28 +320,28 @@ docker-build-execution-debug: docker-build-execution-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution-corrupted:latest" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-verification docker-build-verification: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification:latest" -t "$(CONTAINER_REGISTRY)/verification:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/verification:$(FLOW_GO_TAG)" . 
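On the local-build side, the $(CGO_FLAG) prefix that the crypto, insecure, and integration Makefiles now put in front of go test is simply an environment-variable assignment scoped to that one command: it sets CGO_CFLAGS for the cgo compilation of the BLST sources. A hedged Go sketch of the same mechanism, driving go test as a child process (the flag value mirrors the portable case; the wrapper itself is illustrative and not part of the patch):

package main

import (
	"os"
	"os/exec"
)

// runPortableTests runs `go test ./...` with the portable (non-ADX) BLST
// flags, mirroring `CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test ./...`.
func runPortableTests() error {
	cmd := exec.Command("go", "test", "./...")
	cmd.Env = append(os.Environ(), "CGO_CFLAGS=-O -D__BLST_PORTABLE__")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	return cmd.Run()
}

func main() {
	if err := runPortableTests(); err != nil {
		os.Exit(1)
	}
}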
.PHONY: docker-build-verification-without-netgo docker-build-verification-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-verification-debug docker-build-verification-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/verification-debug:latest" -t "$(CONTAINER_REGISTRY)/verification-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-debug:$(IMAGE_TAG)" . # build corrupt verification node for BFT testing @@ -348,28 +349,28 @@ docker-build-verification-debug: docker-build-verification-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification-corrupted:latest" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-access docker-build-access: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access:latest" -t "$(CONTAINER_REGISTRY)/access:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG)" -t "$(CONTAINER_REGISTRY)/access:$(FLOW_GO_TAG)" . 
.PHONY: docker-build-access-without-netgo docker-build-access-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-access-debug docker-build-access-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/access-debug:latest" -t "$(CONTAINER_REGISTRY)/access-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-debug:$(IMAGE_TAG)" . # build corrupt access node for BFT testing @@ -377,21 +378,21 @@ docker-build-access-debug: docker-build-access-corrupt: #temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access-corrupted:latest" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(IMAGE_TAG)" . ./insecure/cmd/mods_restore.sh .PHONY: docker-build-observer docker-build-observer: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/observer:latest" -t "$(CONTAINER_REGISTRY)/observer:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG)" . 
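The *-without-netgo targets above tag images with IMAGE_TAG_NO_NETGO and correspond to builds where the Dockerfile's TAGS build-arg drops the default "netgo" tag (the TAGS override itself is outside these hunks). The netgo tag is interpreted by the Go standard library, selecting the pure-Go name resolver, rather than by project code, but the general mechanism is the usual //go:build constraint. The file and identifiers below are hypothetical and only illustrate how a tag-guarded file is written.

//go:build netgo

package buildinfo

// ResolverHint is compiled in only when the netgo tag is set; a sibling
// file guarded with `//go:build !netgo` could supply the other value.
const ResolverHint = "pure-Go resolver (netgo)"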
.PHONY: docker-build-observer-without-netgo docker-build-observer-without-netgo: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG_NO_NETGO)" . @@ -399,18 +400,18 @@ docker-build-observer-without-netgo: .PHONY: docker-build-ghost docker-build-ghost: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/ghost:latest" -t "$(CONTAINER_REGISTRY)/ghost:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost:$(IMAGE_TAG)" . .PHONY: docker-build-ghost-debug docker-build-ghost-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/ghost-debug:latest" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(IMAGE_TAG)" . PHONY: docker-build-bootstrap docker-build-bootstrap: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/bootstrap:latest" -t "$(CONTAINER_REGISTRY)/bootstrap:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap:$(IMAGE_TAG)" . @@ -420,7 +421,7 @@ tool-bootstrap: docker-build-bootstrap .PHONY: docker-build-bootstrap-transit docker-build-bootstrap-transit: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --no-cache \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --no-cache \ --target production \ -t "$(CONTAINER_REGISTRY)/bootstrap-transit:latest" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(IMAGE_TAG)" . 
@@ -430,7 +431,7 @@ tool-transit: docker-build-bootstrap-transit .PHONY: docker-build-loader docker-build-loader: - docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/loader:latest" -t "$(CONTAINER_REGISTRY)/loader:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/loader:$(IMAGE_TAG)" . @@ -615,7 +616,7 @@ docker-all-tools: tool-util tool-remove-execution-fork PHONY: docker-build-util docker-build-util: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/util:latest" -t "$(CONTAINER_REGISTRY)/util:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/util:$(IMAGE_TAG)" . PHONY: tool-util @@ -624,7 +625,7 @@ tool-util: docker-build-util PHONY: docker-build-remove-execution-fork docker-build-remove-execution-fork: - docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CGO_FLAG) --target production \ + docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/remove-execution-fork:latest" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(IMAGE_TAG)" . PHONY: tool-remove-execution-fork diff --git a/cmd/Dockerfile b/cmd/Dockerfile index d3660bd2b27..ade91976f7e 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -44,7 +44,7 @@ ARG CGO_FLAG RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=secret,id=git_creds,dst=/root/.netrc \ - CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "${TAGS}" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux CGO_FLAGS="${CGO_FLAG}" go build --tags "${TAGS}" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -o ./app ${TARGET} @@ -65,7 +65,7 @@ ARG GOARCH=amd64 RUN --mount=type=ssh \ --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=1 GOOS=linux ${CGO_FLAG} go build --tags "netgo" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux CGO_FLAGS="${CGO_FLAG}" go build --tags "netgo" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -gcflags="all=-N -l" -o ./app ${TARGET} diff --git a/crypto/Makefile b/crypto/Makefile index c7361bde76b..04cc9ae19d8 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -22,11 +22,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. 
ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # test all packages .PHONY: test diff --git a/insecure/Makefile b/insecure/Makefile index 635d9a06ad7..fd6fdae0dd9 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -20,11 +20,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # runs all unit tests of the insecure module .PHONY: test diff --git a/integration/Makefile b/integration/Makefile index b01c10d1954..2d7eb14e867 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -20,11 +20,12 @@ endif # the crypto package uses BLST source files underneath which may use ADX insructions. ifeq ($(ADX_SUPPORT), 1) # if ADX insructions are supported, default is to use a fast ADX BLST implementation - CGO_FLAG := + CRYPTO_FLAG := "" else # if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CGO_FLAG := CGO_CFLAGS="-O -D__BLST_PORTABLE__" + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # Run the integration test suite .PHONY: integration-test diff --git a/integration/benchmark/cmd/manual/Dockerfile b/integration/benchmark/cmd/manual/Dockerfile index 8ae85e43326..b93d44812a0 100644 --- a/integration/benchmark/cmd/manual/Dockerfile +++ b/integration/benchmark/cmd/manual/Dockerfile @@ -39,7 +39,7 @@ RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=ssh \ cd integration && \ - CGO_ENABLED=1 ${CGO_FLAG} go build -ldflags "-extldflags -static" -o ./app ./${TARGET} + CGO_ENABLED=1 CGO_FLAGS="${CGO_FLAG}" go build -ldflags "-extldflags -static" -o ./app ./${TARGET} RUN mv /app/integration/app /app/app From fa5177f2fce86f3c53fc65d76a19f753659fa56b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 1 Jun 2023 14:41:01 -0600 Subject: [PATCH 133/200] add cgo flag to mockgen commands --- Makefile | 6 +++--- crypto/bls12381_utils.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 93fa3be60ba..c927ff4403a 100644 --- a/Makefile +++ b/Makefile @@ -145,9 +145,9 @@ generate-fvm-env-wrappers: .PHONY: generate-mocks generate-mocks: install-mock-generators mockery --name '(Connector|PingInfoProvider)' --dir=network/p2p --case=underscore --output="./network/mocknetwork" --outpkg="mocknetwork" - mockgen -destination=storage/mocks/storage.go -package=mocks github.com/onflow/flow-go/storage Blocks,Headers,Payloads,Collections,Commits,Events,ServiceEvents,TransactionResults - mockgen -destination=module/mocks/network.go -package=mocks github.com/onflow/flow-go/module Local,Requester - mockgen -destination=network/mocknetwork/mock_network.go -package=mocknetwork github.com/onflow/flow-go/network Network + $(CGO_FLAG) mockgen 
-destination=storage/mocks/storage.go -package=mocks github.com/onflow/flow-go/storage Blocks,Headers,Payloads,Collections,Commits,Events,ServiceEvents,TransactionResults + $(CGO_FLAG) mockgen -destination=module/mocks/network.go -package=mocks github.com/onflow/flow-go/module Local,Requester + $(CGO_FLAG) mockgen -destination=network/mocknetwork/mock_network.go -package=mocknetwork github.com/onflow/flow-go/network Network mockery --name='.*' --dir=integration/benchmark/mocksiface --case=underscore --output="integration/benchmark/mock" --outpkg="mock" mockery --name=ExecutionDataStore --dir=module/executiondatasync/execution_data --case=underscore --output="./module/executiondatasync/execution_data/mock" --outpkg="mock" mockery --name=Downloader --dir=module/executiondatasync/execution_data --case=underscore --output="./module/executiondatasync/execution_data/mock" --outpkg="mock" diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index fa2a6ff65de..f071b7b9f43 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -14,7 +14,7 @@ package crypto // # include // # include // static void handler(int signum) -// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with \"CGO_CFLAGS=-O -D__BLST_PORTABLE__\"\n"; +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=\"-O -D__BLST_PORTABLE__\"\n"; // ssize_t n = write(2, &text, strlen(text)); // _exit(128+SIGILL); // (void)n; From 2a87898a7ba49eb4d1f04105ca17b4ca916fbd84 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 16 Aug 2023 20:45:21 -0600 Subject: [PATCH 134/200] remove test assertion strings in favor of PRG seed logging for test reproduction --- crypto/bls_test.go | 101 +++++++++++--------------------------- crypto/sign_test_utils.go | 49 +++++++++--------- 2 files changed, 52 insertions(+), 98 deletions(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 4047967be9b..0ead9fd3100 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -75,8 +75,7 @@ func TestBLSMainMethods(t *testing.T) { // test a valid signature result, err := pk.Verify(s, input, hasher) assert.NoError(t, err) - assert.True(t, result, - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk) + assert.True(t, result) } }) } @@ -281,7 +280,7 @@ func TestBLSPOP(t *testing.T) { // test a valid PoP result, err := BLSVerifyPOP(pk, s) require.NoError(t, err) - assert.True(t, result, "Verification should succeed:\n signature:%s\n private key:%s", s, sk) + assert.True(t, result) // test with a valid but different key seed[0] ^= 1 @@ -289,7 +288,7 @@ func TestBLSPOP(t *testing.T) { require.NoError(t, err) result, err = BLSVerifyPOP(wrongSk.PublicKey(), s) require.NoError(t, err) - assert.False(t, result, "Verification should fail:\n signature:%s\n private key:%s", s, sk) + assert.False(t, result) } }) @@ -350,15 +349,11 @@ func TestBLSAggregateSignatures(t *testing.T) { aggSig, err := AggregateBLSSignatures(sigs) require.NoError(t, err) // First check: check the signatures are equal - assert.Equal(t, aggSig, expectedSig, - "incorrect signature %s, should be %s, private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.Equal(t, aggSig, expectedSig) // Second check: Verify the aggregated signature valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.True(t, valid, - 
"Verification of %s failed, signature should be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.True(t, valid) }) // check if one signature is not correct @@ -370,15 +365,11 @@ func TestBLSAggregateSignatures(t *testing.T) { aggSig, err = AggregateBLSSignatures(sigs) require.NoError(t, err) // First check: check the signatures are not equal - assert.NotEqual(t, aggSig, expectedSig, - "signature %s shouldn't be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.NotEqual(t, aggSig, expectedSig) // Second check: multi-verification should fail valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.False(t, valid, - "verification of signature %s should fail, it shouldn't be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.False(t, valid) sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // rebuild the correct signature require.NoError(t, err) }) @@ -393,14 +384,10 @@ func TestBLSAggregateSignatures(t *testing.T) { require.NoError(t, err) expectedSig, err = aggSk.Sign(input, kmac) require.NoError(t, err) - assert.NotEqual(t, aggSig, expectedSig, - "signature %s shouldn't be %s, private keys are %s, input is %x, wrong key is of index %d", - aggSig, expectedSig, sks, input, randomIndex) + assert.NotEqual(t, aggSig, expectedSig) valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.False(t, valid, - "signature %s should fail, shouldn't be %s, private keys are %s, input is %x, wrong key is of index %d", - aggSig, expectedSig, sks, input, randomIndex) + assert.False(t, valid) }) t.Run("invalid inputs", func(t *testing.T) { @@ -500,9 +487,7 @@ func TestBLSAggregatePublicKeys(t *testing.T) { keys := []PublicKey{pks[0], IdentityBLSPublicKey()} aggPkWithIdentity, err := AggregateBLSPublicKeys(keys) assert.NoError(t, err) - assert.True(t, aggPkWithIdentity.Equals(pks[0]), - "incorrect public key %s, should be %s", - aggPkWithIdentity, pks[0]) + assert.True(t, aggPkWithIdentity.Equals(pks[0])) }) t.Run("invalid inputs", func(t *testing.T) { @@ -604,9 +589,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSkey, ok := expectedPatrialPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, BLSkey.Equals(partialPk), - "incorrect key %s, should be %s, keys are %s, index is %d", - partialPk, BLSkey, pks, pkToRemoveNum) + assert.True(t, BLSkey.Equals(partialPk)) }) // remove an extra key and check inequality @@ -617,9 +600,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSkey, ok := expectedPatrialPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.False(t, BLSkey.Equals(partialPk), - "incorrect key %s, should not be %s, keys are %s, index is %d, extra key is %s", - partialPk, BLSkey, pks, pkToRemoveNum, extraPk) + assert.False(t, BLSkey.Equals(partialPk)) }) // specific test to remove all keys @@ -634,9 +615,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSRandomPk, ok := randomPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, BLSRandomPk.Equals(randomPkPlusIdentityPk), - "incorrect key %s, should be infinity point, keys are %s", - identityPk, pks) + assert.True(t, BLSRandomPk.Equals(randomPkPlusIdentityPk)) }) // specific test with an empty slice of keys to remove @@ -647,9 +626,7 @@ func TestBLSRemovePubKeys(t *testing.T) { aggBLSkey, ok := aggPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, aggBLSkey.Equals(partialPk), - "incorrect key %s, should be %s", - partialPk, aggBLSkey) + 
assert.True(t, aggBLSkey.Equals(partialPk)) }) t.Run("invalid inputs", func(t *testing.T) { @@ -702,9 +679,7 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("all signatures are valid", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) }) // valid signatures but indices aren't correct: sig[i] is correct under pks[j] @@ -719,9 +694,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) expectedValid[i], expectedValid[j] = false, false - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) // restore keys pks[i], pks[j] = pks[j], pks[i] @@ -732,9 +705,7 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("one valid signature", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) require.NoError(t, err) - assert.Equal(t, expectedValid[:1], valid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs[:1], sks[:1], input, valid) + assert.Equal(t, expectedValid[:1], valid) }) // pick a random number of invalid signatures @@ -759,9 +730,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, expectedValid, valid, - "Verification of %s failed\n private keys are %s\n input is %x\n results is %v", - sigs, sks, input, valid) + assert.Equal(t, expectedValid, valid) }) // all signatures are invalid @@ -776,9 +745,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) }) // test the empty list case @@ -786,8 +753,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:0], sigs[:0], input, kmac) require.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.Equal(t, valid, expectedValid[:0], - "verification should fail with empty list key, got %v", valid) + assert.Equal(t, valid, expectedValid[:0]) }) // test incorrect inputs @@ -798,8 +764,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:len(pks)-1], sigs, input, kmac) require.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with incorrect input lenghts, got %v", valid) + assert.Equal(t, valid, expectedValid) }) // test wrong hasher @@ -811,8 +776,7 @@ func TestBLSBatchVerify(t *testing.T) { require.Error(t, err) assert.True(t, IsNilHasherError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with nil hasher, got %v", valid) + assert.Equal(t, valid, expectedValid) }) // test wrong key @@ -825,8 +789,7 @@ func TestBLSBatchVerify(t *testing.T) { require.Error(t, err) assert.True(t, IsNotBLSKeyError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with invalid key, got %v", valid) + assert.Equal(t, valid, 
expectedValid) }) } @@ -962,9 +925,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { // Verify the aggregated signature valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) require.NoError(t, err) - assert.True(t, valid, - "Verification of %s failed, should be valid, private keys are %s, inputs are %x, input public keys are %s", - aggSig, sks, inputMsgs, inputPks) + assert.True(t, valid) }) // check if one of the signatures is not correct @@ -979,9 +940,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { require.NoError(t, err) valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) require.NoError(t, err) - assert.False(t, valid, - "Verification of %s should fail, private keys are %s, inputs are %x, input public keys are %s", - aggSig, sks, inputMsgs, inputPks) + assert.False(t, valid) }) // test the empty keys case @@ -989,7 +948,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, "verification should fail with an empty key list") + assert.False(t, valid) }) // test inconsistent input arrays @@ -998,13 +957,13 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs[:sigsNum-1], inputKmacs) assert.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.False(t, valid, "verification should fail with inconsistent messages and hashers") + assert.False(t, valid) // empty key list valid, err = VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, "verification should fail with empty list key") + assert.False(t, valid) // nil hasher tmp := inputKmacs[0] @@ -1012,7 +971,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsNilHasherError(err)) - assert.False(t, valid, "verification should fail with nil hasher") + assert.False(t, valid) inputKmacs[0] = tmp // wrong key @@ -1021,7 +980,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, valid, "verification should fail with nil hasher") + assert.False(t, valid) inputPks[0] = tmpPK }) diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 06179a01989..8f00a0c77e5 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -75,15 +75,13 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { // test a valid signature result, err := pk.Verify(s, input, halg) require.NoError(t, err) - assert.True(t, result, fmt.Sprintf( - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + assert.True(t, result) // test with a different message input[0] ^= 1 result, err = pk.Verify(s, input, halg) require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + assert.False(t, result) input[0] ^= 1 // test with a valid but different key @@ -92,8 +90,7 @@ func testGenSignVerify(t 
*testing.T, salg SigningAlgorithm, halg hash.Hasher) { require.NoError(t, err) result, err = wrongSk.PublicKey().Verify(s, input, halg) require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + assert.False(t, result) // test a wrong signature length invalidLen := rand.Intn(2 * len(s)) // try random invalid lengths @@ -103,9 +100,7 @@ func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { invalidSig := make([]byte, invalidLen) result, err = pk.Verify(invalidSig, input, halg) require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - + assert.False(t, result) } }) } @@ -172,7 +167,7 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { require.Equal(t, read, KeyGenSeedMinLen) require.NoError(t, err) sk, err := GeneratePrivateKey(salg, seed) - assert.Nil(t, err, "the key generation failed") + assert.Nil(t, err) seed[0] ^= 1 // alter the seed to get a new private key distinctSk, err := GeneratePrivateKey(salg, seed) require.NoError(t, err) @@ -180,10 +175,10 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { // check private key encoding skBytes := sk.Encode() skCheck, err := DecodePrivateKey(salg, skBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, sk.Equals(skCheck), "key equality check failed") + require.Nil(t, err) + assert.True(t, sk.Equals(skCheck)) skCheckBytes := skCheck.Encode() - assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") + assert.Equal(t, skBytes, skCheckBytes) distinctSkBytes := distinctSk.Encode() assert.NotEqual(t, skBytes, distinctSkBytes) @@ -192,23 +187,23 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { pkBytes := pk.Encode() pkCheck, err := DecodePublicKey(salg, pkBytes) require.Nil(t, err) - assert.True(t, pk.Equals(pkCheck), "key equality check failed") + assert.True(t, pk.Equals(pkCheck)) pkCheckBytes := pkCheck.Encode() - assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") + assert.Equal(t, pkBytes, pkCheckBytes) distinctPkBytes := distinctSk.PublicKey().Encode() - assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") + assert.NotEqual(t, pkBytes, distinctPkBytes) // same for the compressed encoding // skip is BLS is used and compression isn't supported if !(salg == BLSBLS12381 && !isG2Compressed()) { pkComprBytes := pk.EncodeCompressed() pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") + require.Nil(t, err) + assert.True(t, pk.Equals(pkComprCheck)) pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") + assert.Equal(t, pkComprBytes, pkCheckComprBytes) distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") + assert.NotEqual(t, pkComprBytes, distinctPkComprBytes) } } }) @@ -228,7 +223,7 @@ func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { groupOrder[BLSBLS12381] = BLS12381Order sk, err := DecodePrivateKey(salg, groupOrder[salg]) - require.Error(t, err, "the key decoding should fail - private key value is too large") + require.Error(t, err) assert.True(t, IsInvalidInputsError(err)) assert.Nil(t, sk) }) @@ -293,12 +288,12 @@ func 
testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorit pk4 := sk4.PublicKey() // tests - assert.True(t, sk1.Equals(sk2), "key equality should return true") - assert.True(t, pk1.Equals(pk2), "key equality should return true") - assert.False(t, sk1.Equals(sk3), "key equality should return false") - assert.False(t, pk1.Equals(pk3), "key equality should return false") - assert.False(t, sk1.Equals(sk4), "key equality should return false") - assert.False(t, pk1.Equals(pk4), "key equality should return false") + assert.True(t, sk1.Equals(sk2)) + assert.True(t, pk1.Equals(pk2)) + assert.False(t, sk1.Equals(sk3)) + assert.False(t, pk1.Equals(pk3)) + assert.False(t, sk1.Equals(sk4)) + assert.False(t, pk1.Equals(pk4)) }) } From 8ffac589dc4dec581c91715e4e261add09625715 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 17 Aug 2023 14:42:51 -0600 Subject: [PATCH 135/200] add interface implementation sanity checks --- crypto/bls.go | 6 ++++++ crypto/ecdsa.go | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/crypto/bls.go b/crypto/bls.go index 447a203033b..b5ed13bd83d 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -349,6 +349,9 @@ func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (Publ } // prKeyBLSBLS12381 is the private key of BLS using BLS12_381, it implements PrivateKey + +var _ PrivateKey = (*prKeyBLSBLS12381)(nil) + type prKeyBLSBLS12381 struct { // public key pk *pubKeyBLSBLS12381 @@ -426,6 +429,9 @@ func (sk *prKeyBLSBLS12381) String() string { // pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, // it implements PublicKey. + +var _ PublicKey = (*pubKeyBLSBLS12381)(nil) + type pubKeyBLSBLS12381 struct { // The package guarantees an instance is only created with a point // on the correct G2 subgroup. No membership check is needed when the diff --git a/crypto/ecdsa.go b/crypto/ecdsa.go index dca3604570a..67d97e9a854 100644 --- a/crypto/ecdsa.go +++ b/crypto/ecdsa.go @@ -321,7 +321,10 @@ func (a *ecdsaAlgo) decodePublicKeyCompressed(pkBytes []byte) (PublicKey, error) return &pubKeyECDSA{a, goPubKey}, nil } -// prKeyECDSA is the private key of ECDSA, it implements the generic PrivateKey +// prKeyECDSA is the private key of ECDSA, it implements the interface PrivateKey + +var _ PrivateKey = (*prKeyECDSA)(nil) + type prKeyECDSA struct { // the signature algo alg *ecdsaAlgo @@ -392,6 +395,9 @@ func (sk *prKeyECDSA) String() string { } // pubKeyECDSA is the public key of ECDSA, it implements PublicKey + +var _ PublicKey = (*pubKeyECDSA)(nil) + type pubKeyECDSA struct { // the signature algo alg *ecdsaAlgo From 8415a45958a8ea41d863a7b3d75ba0a6fd743cde Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 17 Aug 2023 23:58:36 -0600 Subject: [PATCH 136/200] add faster scalar mult in E2 for small expos --- crypto/bls12381_utils.c | 50 ++++++++++++++++++++++++++--------- crypto/bls12381_utils.h | 3 ++- crypto/bls12381_utils_test.go | 3 ++- crypto/dkg_core.c | 2 +- crypto/sign_test_utils.go | 3 ++- 5 files changed, 45 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index d88bfa3aaa8..97725545b26 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -95,6 +95,7 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // if base = b*R, res = b^expo * R // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. +// `expo` must be non-zero. // TODO: clean up? 
void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { // mask of the most significant bit @@ -103,15 +104,15 @@ void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_le int index = 0; expo += expo_len; - // Treat most significant zero limbs + // process most significant zero limbs while((index < expo_len) && (*(--expo) == 0)) { index++; } - // Treat the most significant zero bits + // process the most significant zero bits while((*expo & mask) == 0) { mask >>= 1; } - // Treat the first `1` bit + // process the first `1` bit Fr_copy(res, base); mask >>= 1; // Scan all limbs of the exponent @@ -909,6 +910,11 @@ void E2_add(E2* res, const E2* a, const E2* b) { POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); } +// generic point double that must handle point at infinity +void E2_double(E2* res, const E2* a) { + POINTonE2_double((POINTonE2*)res, (POINTonE2*)a); +} + // Point negation: res = -a void E2_neg(E2* res, const E2* a) { // TODO: optimize @@ -924,14 +930,34 @@ void E2_mult(E2* res, const E2* p, const Fr* expo) { vec_zero(&tmp, sizeof(tmp)); } -// Exponentiation of a generic point `a` in E2 by a byte exponent. +// Exponentiation of a generic point `a` in E2 by a byte exponent, +// using a classic double-and-add algorithm (non constant-time) void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { - pow256 pow_expo; - vec_zero(&pow_expo, sizeof(pow256)); - pow_expo[0] = expo; // `pow256` uses bytes little endian. - // TODO: to bench against a specific version of mult with 8 bits expo - POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, pow_expo); - pow_expo[0] = 0; + // return early if expo is zero + if (expo == 0) { + E2_set_infty(res); + return; + } + // expo is non zero + + byte mask = 1<<7; + // process the most significant zero bits + while((expo & mask) == 0) { + mask >>= 1; + } + + // process the first `1` bit + E2 tmp; + E2_copy(&tmp, p); + mask >>= 1; + // scan the remaining bits + for ( ; mask != 0 ; mask >>= 1 ) { + E2_double(&tmp, &tmp); + if (expo & mask) { + E2_add(&tmp, &tmp, p); + } + } + E2_copy(res, &tmp); } // Exponentiation of generator g2 of G2, res = expo.g2 @@ -1126,8 +1152,8 @@ void E1_print_(char* s, const E1* p, const int jacob) { void E2_print_(char* s, const E2* p, const int jacob) { E2 a; E2_copy(&a, p); - if (strlen(s)) if (!jacob) E2_to_affine(&a, &a); - printf("[%s]:\n", s); + if (!jacob) E2_to_affine(&a, &a); + if (strlen(s)) printf("[%s]:\n", s); Fp2_print_("", &(a.x)); Fp2_print_("", &(a.y)); if (jacob) Fp2_print_("", &(a.z)); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index d35e0298c59..ae899877be2 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -113,6 +113,7 @@ void G2_mult_gen(E2*, const Fr*); void E2_mult(E2*, const E2*, const Fr*); void E2_mult_small_expo(E2*, const E2*, const byte); void E2_add(E2* res, const E2* a, const E2* b); +void E2_double(E2* res, const E2* a); void E2_neg(E2*, const E2*); void E2_sum_vector(E2*, const E2*, const int); void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); @@ -129,7 +130,7 @@ void Fp12_multi_pairing(Fp12*, const E1*, const E2*, const int); void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions -#define DEBUG 0 +#define DEBUG 1 #if (DEBUG == 1) #include void bytes_print_(char*, byte*, int); diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 067ac979f7e..a9efd543ed1 100644 --- 
a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -1,6 +1,7 @@ package crypto import ( + "crypto/rand" "encoding/hex" mrand "math/rand" "testing" @@ -54,7 +55,7 @@ func TestScalarMultBLS12381(t *testing.T) { // G1 and G2 scalar multiplication func BenchmarkScalarMult(b *testing.B) { seed := make([]byte, securityBits/8) - _, err := mrand.Read(seed) + _, err := rand.Read(seed) require.NoError(b, err) var expo scalar diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 15e8e0c48b3..674973e1d8a 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -35,7 +35,7 @@ void Fr_polynomial_image(Fr* image, E2* y, const Fr* a, const int degree, const static void E2_polynomial_image(E2* y, const E2* A, const int degree, const byte x){ E2_set_infty(y); for (int i = degree; i >= 0 ; i--) { - E2_mult_small_expo(y, y, x); // TODO: to bench against a specific version of mult with 8 bits expo + E2_mult_small_expo(y, y, x); E2_add(y, y, &A[i]); } } diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index 8f00a0c77e5..9ecc684a4be 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -5,6 +5,7 @@ import ( "fmt" mrand "math/rand" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -13,7 +14,7 @@ import ( ) func getPRG(t *testing.T) *mrand.Rand { - random := int64(1685491239186156000) //time.Now().UnixNano() + random := time.Now().UnixNano() t.Logf("rng seed is %d", random) rng := mrand.New(mrand.NewSource(random)) return rng From a408dec46ae1e1c5366dfe89bef212d43b4438b6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 12:35:33 -0600 Subject: [PATCH 137/200] more implementation check sanity check --- crypto/bls_thresholdsign.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 2f05ed72c42..9451f4fb6dc 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -34,6 +34,8 @@ import ( // blsThresholdSignatureParticipant implements ThresholdSignatureParticipant // based on the BLS signature scheme +var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) + type blsThresholdSignatureParticipant struct { // embed the follower *blsThresholdSignatureInspector @@ -45,6 +47,8 @@ type blsThresholdSignatureParticipant struct { // blsThresholdSignatureInspector implements ThresholdSignatureInspector // based on the BLS signature scheme +var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) + type blsThresholdSignatureInspector struct { // size of the group size int From 65ee3bf1ab143e77d74e29a3ebcfdc9e3ac72c38 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 13:13:39 -0600 Subject: [PATCH 138/200] clean up some todos and add global g2 key --- crypto/bls12381_utils.c | 1 - crypto/bls12381_utils.go | 5 +++++ crypto/bls_core.c | 2 +- crypto/bls_multisig.go | 10 ++-------- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 97725545b26..0614af773fe 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -165,7 +165,6 @@ static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) } // internal type of BLST `pow256` uses bytes little endian. -// TODO: check endianness!! 
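The `E2_mult_small_expo` rewrite in patch 136 above replaces the pow256-based multiplication with an explicit MSB-first double-and-add scan of the byte exponent. Below is a minimal standalone sketch of the same pattern over plain integers (the name `small_expo_mult` is hypothetical and not part of the library); integer doubling stands in for `E2_double` and integer addition for `E2_add`:

```c
#include <assert.h>
#include <stdint.h>

// MSB-first double-and-add, mirroring the bit scan in E2_mult_small_expo:
// returns expo * p, with doubling/addition standing in for the group operations.
static uint64_t small_expo_mult(uint64_t p, uint8_t expo) {
  if (expo == 0) {
    return 0; // analogous to returning the point at infinity
  }
  uint8_t mask = 1 << 7;
  while ((expo & mask) == 0) { // skip the most significant zero bits
    mask >>= 1;
  }
  uint64_t acc = p; // process the first `1` bit
  mask >>= 1;
  for (; mask != 0; mask >>= 1) { // scan the remaining bits
    acc = 2 * acc;   // double
    if (expo & mask) {
      acc = acc + p; // add
    }
  }
  return acc;
}

int main(void) {
  for (int e = 0; e < 256; e++) {
    assert(small_expo_mult(7, (uint8_t)e) == 7u * (uint64_t)e);
  }
  return 0;
}
```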
static void pow256_from_Fr(pow256 ret, const Fr* in) { le_bytes_from_limbs(ret, (limb_t*)in, Fr_BYTES); } diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index f071b7b9f43..40580ca7239 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -67,6 +67,8 @@ var g2SerHeader byte // g2 // `g1“ serialization var g1Serialization []byte +var g2PublicKey pubKeyBLSBLS12381 + // initialization of BLS12-381 curve func initBLS12381() { if isG1Compressed() { @@ -80,6 +82,9 @@ func initBLS12381() { } else { g2SerHeader = 0x40 } + // set a global point to infinity + C.E2_set_infty((*C.E2)(&g2PublicKey.point)) + g2PublicKey.isIdentity = true } func (a *scalar) String() string { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 0771269ed86..39b9e243fd1 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -398,7 +398,7 @@ void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, Fr_set_limb(&one, 1); Fr_add(&r, &r, &one); // multiply public key and signature by the same random exponent r - E2_mult(&pks[i], &pks_input[i], &r); // TODO: faster version for short expos? + E2_mult(&pks[i], &pks_input[i], &r); E1_mult(&sigs[i], &sigs[i], &r); } } diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 7adbb0c1f45..7f57cd09888 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -192,15 +192,9 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } // IdentityBLSPublicKey returns an identity public key which corresponds to the point -// at infinity in G2 (identity element of G2). -// TODO: return a constant key instead of a newly allocated one +// at infinity in G2 (identity element g2). func IdentityBLSPublicKey() PublicKey { - - identity := *newPubKeyBLSBLS12381(nil) - // set the point to infinity - C.E2_set_infty((*C.E2)(&identity.point)) - identity.isIdentity = true - return &identity + return &g2PublicKey } // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. From b3797441d28be8009b4262218e45fbbeb5529e22 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 13:45:51 -0600 Subject: [PATCH 139/200] address more TODOs --- crypto/bls12381_utils.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 0614af773fe..b2385baa37f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -95,8 +95,7 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // if base = b*R, res = b^expo * R // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. -// `expo` must be non-zero. -// TODO: clean up? 
+// TODO: could be deleted void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { // mask of the most significant bit const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); @@ -108,29 +107,38 @@ void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_le while((index < expo_len) && (*(--expo) == 0)) { index++; } + // if expo is zero + if (index == expo_len) { + Fr_copy(res, base); + return; + } + // expo is non zero // process the most significant zero bits while((*expo & mask) == 0) { mask >>= 1; } + Fr tmp; // process the first `1` bit - Fr_copy(res, base); + Fr_copy(&tmp, base); mask >>= 1; // Scan all limbs of the exponent for ( ; index < expo_len; expo--) { // Scan all bits for ( ; mask != 0 ; mask >>= 1 ) { // square - Fr_squ_montg(res, res); + Fr_squ_montg(&tmp, &tmp); // multiply if (*expo & mask) { - Fr_mul_montg(res, res ,base); + Fr_mul_montg(&tmp, &tmp ,base); } } mask = msb_mask; index++; } + Fr_copy(res, &tmp); } +// TODO: could be deleted void Fr_inv_exp_montg(Fr *res, const Fr *a) { Fr r_2; Fr_copy(&r_2, (Fr*)BLS12_381_r); @@ -217,8 +225,7 @@ void Fr_write_bytes(byte *bin, const Fr* a) { // maps big-endian bytes into an Fr element using modular reduction // Input is byte-big-endian, output is Fr (internally vec256) // TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); -static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) -{ +static void Fr_from_be_bytes(Fr* out, const byte *bytes, size_t n) { Fr digit, radix; Fr_set_zero(out); Fr_copy(&radix, (Fr*)BLS12_381_rRR); // R^2 @@ -610,7 +617,6 @@ void E1_add(E1* res, const E1* a, const E1* b) { // Point negation: res = -a void E1_neg(E1* res, const E1* a) { - // TODO: optimize E1_copy(res, a); POINTonE1_cneg((POINTonE1*)res, 1); } @@ -916,7 +922,6 @@ void E2_double(E2* res, const E2* a) { // Point negation: res = -a void E2_neg(E2* res, const E2* a) { - // TODO: optimize E2_copy(res, a); POINTonE2_cneg((POINTonE2*)res, 1); } From 9d6c7c7d7c245c86ae97544a5f213d21a67545db Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 14:14:10 -0600 Subject: [PATCH 140/200] add c-formatting target --- crypto/.clang-format | 192 ++++ crypto/Makefile | 9 + crypto/bls12381_utils.c | 1593 ++++++++++++++-------------- crypto/bls12381_utils.h | 210 ++-- crypto/bls_core.c | 789 +++++++------- crypto/bls_include.h | 20 +- crypto/bls_thresholdsign_core.c | 188 ++-- crypto/bls_thresholdsign_include.h | 6 +- crypto/blst_include.h | 55 +- crypto/dkg_core.c | 114 +- crypto/dkg_include.h | 14 +- 11 files changed, 1738 insertions(+), 1452 deletions(-) create mode 100644 crypto/.clang-format diff --git a/crypto/.clang-format b/crypto/.clang-format new file mode 100644 index 00000000000..48b2c678323 --- /dev/null +++ b/crypto/.clang-format @@ -0,0 +1,192 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false 
+AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +QualifierAlignment: Leave +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +PackConstructorInitializers: BinPack +BasedOnStyle: '' +ConstructorInitializerAllOnOneLineOrOnePerLine: false +AllowAllConstructorInitializersOnNextLine: true +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertTrailingCommas: None +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +PPIndentWidth: -1 +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false 
+SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + BeforeNonEmptyParentheses: false +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE + - NS_SWIFT_NAME + - CF_SWIFT_NAME +... + diff --git a/crypto/Makefile b/crypto/Makefile index 04cc9ae19d8..28e7a5f6f2f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -29,6 +29,15 @@ else endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) +# format +.PHONY: c-format +c-format: + clang-format -style=llvm -dump-config > .clang-format + clang-format -i *.c + clang-format -i *.h + rm -f .clang-format + git diff --exit-code + # test all packages .PHONY: test test: diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b2385baa37f..665f3853236 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1,10 +1,10 @@ // this file contains utility functions for the curve BLS 12-381 -// these tools are shared by the BLS signature scheme, the BLS based threshold signature -// and the BLS distributed key generation protocols +// these tools are shared by the BLS signature scheme, the BLS based threshold +// signature and the BLS distributed key generation protocols #include "bls12381_utils.h" -#include "bls_include.h" #include "assert.h" +#include "bls_include.h" // compile all blst C src along with this file #include "blst_src.c" @@ -12,83 +12,87 @@ // ------------------- Fr utilities // Montgomery constant R related to the curve order r -// R mod r = (1<<256)%r -const Fr BLS12_381_rR = {{ \ - TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), \ - TO_LIMB_T(0x5884b7fa00034802), TO_LIMB_T(0x00000001fffffffe), \ +// R mod r = (1<<256)%r +const Fr BLS12_381_rR = {{ + TO_LIMB_T(0x1824b159acc5056f), + TO_LIMB_T(0x998c4fefecbc4ff5), + TO_LIMB_T(0x5884b7fa00034802), + TO_LIMB_T(0x00000001fffffffe), }}; // returns true if a == 0 and false otherwise -bool Fr_is_zero(const Fr* a) { - return bytes_are_zero((const byte*)a, sizeof(Fr)); +bool Fr_is_zero(const Fr *a) { + return bytes_are_zero((const byte *)a, sizeof(Fr)); } // returns true if a == b and false otherwise -bool Fr_is_equal(const Fr* a, const Fr* b) { - return vec_is_equal(a, b, sizeof(Fr)); +bool Fr_is_equal(const Fr *a, const Fr *b) { + return vec_is_equal(a, b, sizeof(Fr)); } // sets `a` to limb `l` -void Fr_set_limb(Fr* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); - *((limb_t*)a) = l; +void Fr_set_limb(Fr *a, const limb_t l) { + vec_zero((byte *)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); + *((limb_t *)a) = l; } -void Fr_copy(Fr* res, const Fr* a) { - if ((uptr_t)a==(uptr_t)res) { - return; - } - 
vec_copy((byte*)res, (byte*)a, sizeof(Fr)); +void Fr_copy(Fr *res, const Fr *a) { + if ((uptr_t)a == (uptr_t)res) { + return; + } + vec_copy((byte *)res, (byte *)a, sizeof(Fr)); } // sets `a` to 0 -void Fr_set_zero(Fr* a){ - vec_zero((byte*)a, sizeof(Fr)); -} +void Fr_set_zero(Fr *a) { vec_zero((byte *)a, sizeof(Fr)); } void Fr_add(Fr *res, const Fr *a, const Fr *b) { - add_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); + add_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); } void Fr_sub(Fr *res, const Fr *a, const Fr *b) { - sub_mod_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r); + sub_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); } void Fr_neg(Fr *res, const Fr *a) { - cneg_mod_256((limb_t*)res, (limb_t*)a, 1, BLS12_381_r); + cneg_mod_256((limb_t *)res, (limb_t *)a, 1, BLS12_381_r); } // res = a*b*R^(-1) void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b) { - mul_mont_sparse_256((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_r, r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r, r0); } // res = a^2 * R^(-1) void Fr_squ_montg(Fr *res, const Fr *a) { - sqr_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); + sqr_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_r, r0); } // res = a*R void Fr_to_montg(Fr *res, const Fr *a) { - mul_mont_sparse_256((limb_t*)res, (limb_t*)a, BLS12_381_rRR, BLS12_381_r, r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_rRR, BLS12_381_r, + r0); } // res = a*R^(-1) void Fr_from_montg(Fr *res, const Fr *a) { - from_mont_256((limb_t*)res, (limb_t*)a, BLS12_381_r, r0); + from_mont_256((limb_t *)res, (limb_t *)a, BLS12_381_r, r0); } // res = a^(-1)*R void Fr_inv_montg_eucl(Fr *res, const Fr *a) { - // copied and modified from BLST code - // Copyright Supranational LLC - static const vec256 rx2 = { /* left-aligned value of the modulus */ - TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), - TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), - }; - vec512 temp; - ct_inverse_mod_256(temp, (limb_t*)a, BLS12_381_r, rx2); - redc_mont_256((limb_t*)res, temp, BLS12_381_r, r0); + // copied and modified from BLST code + // Copyright Supranational LLC + static const vec256 rx2 = { + /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), + TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), + TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + ct_inverse_mod_256(temp, (limb_t *)a, BLS12_381_r, rx2); + redc_mont_256((limb_t *)res, temp, BLS12_381_r, r0); } // result is in Montgomery form if base is in montgomery form @@ -96,85 +100,85 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { // In general, res = base^expo * R^(-expo+1) // `expo` is encoded as a little-endian limb_t table of length `expo_len`. 
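As background for the Montgomery helpers above and the exponentiation routine that follows, one Montgomery multiplication computes $M(x,y) = x\,y\,R^{-1} \bmod r$ (this is what the `res = a*b*R^(-1)` comments describe). A short sketch of why the stated results hold:

$$M(aR, bR) = abR, \qquad M(a, R^2) = aR \;(\texttt{Fr\_to\_montg}), \qquad M(\bar{a}, 1) = \bar{a}\,R^{-1} \;(\texttt{Fr\_from\_montg}).$$

In any multiplication chain built from copies of a base $x$, an intermediate value representing $x^k$ carries the factor $R^{-(k-1)}$, because $M(x^a R^{-(a-1)},\, x^b R^{-(b-1)}) = x^{a+b} R^{-(a+b-1)}$. The square-and-multiply loop therefore ends with

$$\mathrm{res} = x^{e}\,R^{-e+1},$$

which becomes $b^{e}R$ when the base is already in Montgomery form ($x = bR$), matching the comment on `Fr_exp_montg`.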
// TODO: could be deleted -void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len) { - // mask of the most significant bit - const limb_t msb_mask = (limb_t)1<<((sizeof(limb_t)<<3)-1); - limb_t mask = msb_mask; - int index = 0; - - expo += expo_len; - // process most significant zero limbs - while((index < expo_len) && (*(--expo) == 0)) { - index++; - } - // if expo is zero - if (index == expo_len) { - Fr_copy(res, base); - return; - } - // expo is non zero - // process the most significant zero bits - while((*expo & mask) == 0) { - mask >>= 1; +void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, + const int expo_len) { + // mask of the most significant bit + const limb_t msb_mask = (limb_t)1 << ((sizeof(limb_t) << 3) - 1); + limb_t mask = msb_mask; + int index = 0; + + expo += expo_len; + // process most significant zero limbs + while ((index < expo_len) && (*(--expo) == 0)) { + index++; + } + // if expo is zero + if (index == expo_len) { + Fr_copy(res, base); + return; + } + // expo is non zero + // process the most significant zero bits + while ((*expo & mask) == 0) { + mask >>= 1; + } + Fr tmp; + // process the first `1` bit + Fr_copy(&tmp, base); + mask >>= 1; + // Scan all limbs of the exponent + for (; index < expo_len; expo--) { + // Scan all bits + for (; mask != 0; mask >>= 1) { + // square + Fr_squ_montg(&tmp, &tmp); + // multiply + if (*expo & mask) { + Fr_mul_montg(&tmp, &tmp, base); + } } - Fr tmp; - // process the first `1` bit - Fr_copy(&tmp, base); - mask >>= 1; - // Scan all limbs of the exponent - for ( ; index < expo_len; expo--) { - // Scan all bits - for ( ; mask != 0 ; mask >>= 1 ) { - // square - Fr_squ_montg(&tmp, &tmp); - // multiply - if (*expo & mask) { - Fr_mul_montg(&tmp, &tmp ,base); - } - } - mask = msb_mask; - index++; - } - Fr_copy(res, &tmp); + mask = msb_mask; + index++; + } + Fr_copy(res, &tmp); } // TODO: could be deleted void Fr_inv_exp_montg(Fr *res, const Fr *a) { - Fr r_2; - Fr_copy(&r_2, (Fr*)BLS12_381_r); - r_2.limbs[0] -= 2; - Fr_exp_montg(res, a, (limb_t*)&r_2, 4); + Fr r_2; + Fr_copy(&r_2, (Fr *)BLS12_381_r); + r_2.limbs[0] -= 2; + Fr_exp_montg(res, a, (limb_t *)&r_2, 4); } // computes the sum of the array elements and writes the sum in jointx -void Fr_sum_vector(Fr* jointx, const Fr x[], const int len) { - Fr_set_zero(jointx); - for (int i=0; i Fr_BYTES) { - // limbs_from_be_bytes works for both limb endiannesses - limbs_from_be_bytes((limb_t*)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i - Fr_mul_montg(&digit, &digit, &radix); // l_i * R^i (i is the loop number starting at 1) - Fr_add(out, out, &digit); - Fr_mul_montg(&radix, &radix, (Fr*)BLS12_381_rRR); // R^(i+1) - n -= Fr_BYTES; - } - Fr_set_zero(&digit); - limbs_from_be_bytes((limb_t*)&digit, p - n, n); - Fr_mul_montg(&digit, &digit, &radix); +// TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t +// n0); +static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { + Fr digit, radix; + Fr_set_zero(out); + Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 + + byte *p = (byte *)bytes + n; + while (n > Fr_BYTES) { + // limbs_from_be_bytes works for both limb endiannesses + limbs_from_be_bytes((limb_t *)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i + Fr_mul_montg(&digit, &digit, + &radix); // l_i * R^i (i is the loop number starting at 1) Fr_add(out, out, &digit); - // at this point : out = l_1*R + L_2*R^2 .. 
+ L_n*R^n - // reduce the extra R - Fr_from_montg(out, out); - // clean up possible sensitive data - Fr_set_zero(&digit); + Fr_mul_montg(&radix, &radix, (Fr *)BLS12_381_rRR); // R^(i+1) + n -= Fr_BYTES; + } + Fr_set_zero(&digit); + limbs_from_be_bytes((limb_t *)&digit, p - n, n); + Fr_mul_montg(&digit, &digit, &radix); + Fr_add(out, out, &digit); + // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n + // reduce the extra R + Fr_from_montg(out, out); + // clean up possible sensitive data + Fr_set_zero(&digit); } // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr* a, const byte* bin, int len) { - Fr_from_be_bytes(a, bin, len); - return Fr_is_zero(a); +bool map_bytes_to_Fr(Fr *a, const byte *bin, int len) { + Fr_from_be_bytes(a, bin, len); + return Fr_is_zero(a); } // ------------------- Fp utilities // Montgomery constants related to the prime p -const Fp BLS12_381_pR = { ONE_MONT_P }; /* R mod p = (1<<384)%p */ +const Fp BLS12_381_pR = {ONE_MONT_P}; /* R mod p = (1<<384)%p */ // sets `a` to 0 -static void Fp_set_zero(Fp* a){ - vec_zero((byte*)a, sizeof(Fp)); -} +static void Fp_set_zero(Fp *a) { vec_zero((byte *)a, sizeof(Fp)); } // sets `a` to limb `l` -static void Fp_set_limb(Fp* a, const limb_t l){ - vec_zero((byte*)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); - *((limb_t*)a) = l; +static void Fp_set_limb(Fp *a, const limb_t l) { + vec_zero((byte *)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); + *((limb_t *)a) = l; } -void Fp_copy(Fp* res, const Fp* a) { - if ((uptr_t)a==(uptr_t)res) { - return; - } - vec_copy((byte*)res, (byte*)a, sizeof(Fp)); +void Fp_copy(Fp *res, const Fp *a) { + if ((uptr_t)a == (uptr_t)res) { + return; + } + vec_copy((byte *)res, (byte *)a, sizeof(Fp)); } static void Fp_add(Fp *res, const Fp *a, const Fp *b) { - add_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); + add_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); } static void Fp_sub(Fp *res, const Fp *a, const Fp *b) { - sub_mod_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P); + sub_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); } static void Fp_neg(Fp *res, const Fp *a) { - cneg_mod_384((limb_t*)res, (limb_t*)a, 1, BLS12_381_P); + cneg_mod_384((limb_t *)res, (limb_t *)a, 1, BLS12_381_P); } -// checks if `a` is a quadratic residue in Fp. If yes, it computes +// checks if `a` is a quadratic residue in Fp. If yes, it computes // the square root in `res`. -// +// // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. -static bool Fp_sqrt_montg(Fp *res, const Fp* a) { - return sqrt_fp((limb_t*)res, (limb_t*)a); +static bool Fp_sqrt_montg(Fp *res, const Fp *a) { + return sqrt_fp((limb_t *)res, (limb_t *)a); } -static bool Fp_check(const Fp* in) { - // use same method as in BLST internal function - // which seems the most efficient. The method uses the assembly-based - // modular addition instead of limbs comparison - Fp temp; - Fp_add(&temp, in, &ZERO_384); - return vec_is_equal(&temp, in, Fp_BYTES); - // no need to clear `tmp` as no use-case involves sensitive data being passed as `in` +static bool Fp_check(const Fp *in) { + // use same method as in BLST internal function + // which seems the most efficient. 
The method uses the assembly-based + // modular addition instead of limbs comparison + Fp temp; + Fp_add(&temp, in, &ZERO_384); + return vec_is_equal(&temp, in, Fp_BYTES); + // no need to clear `tmp` as no use-case involves sensitive data being passed + // as `in` } // res = a*b*R^(-1) void Fp_mul_montg(Fp *res, const Fp *a, const Fp *b) { - mul_mont_384((limb_t*)res, (limb_t*)a, (limb_t*)b, BLS12_381_P, p0); + mul_mont_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P, p0); } // res = a^2 * R^(-1) void Fp_squ_montg(Fp *res, const Fp *a) { - sqr_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); + sqr_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); } // res = a*R void Fp_to_montg(Fp *res, const Fp *a) { - mul_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_RR, BLS12_381_P, p0); + mul_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_RR, BLS12_381_P, p0); } // res = a*R^(-1) void Fp_from_montg(Fp *res, const Fp *a) { - from_mont_384((limb_t*)res, (limb_t*)a, BLS12_381_P, p0); + from_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); } // reads a scalar in `a` and checks it is a valid Fp element (a < p). @@ -338,82 +345,81 @@ void Fp_from_montg(Fp *res, const Fp *a) { // returns: // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp -// - VALID if the scalar is valid -ERROR Fp_read_bytes(Fp* a, const byte *bin, int len) { - if (len != Fp_BYTES) { - return BAD_ENCODING; - } - limbs_from_be_bytes((limb_t*)a, bin, Fp_BYTES); - // compare read scalar to p - if (!Fp_check(a)) { - return BAD_VALUE; - } - return VALID; +// - VALID if the scalar is valid +ERROR Fp_read_bytes(Fp *a, const byte *bin, int len) { + if (len != Fp_BYTES) { + return BAD_ENCODING; + } + limbs_from_be_bytes((limb_t *)a, bin, Fp_BYTES); + // compare read scalar to p + if (!Fp_check(a)) { + return BAD_VALUE; + } + return VALID; } - -// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. -void Fp_write_bytes(byte *bin, const Fp* a) { - be_bytes_from_limbs(bin, (limb_t*)a, Fp_BYTES); +// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. +void Fp_write_bytes(byte *bin, const Fp *a) { + be_bytes_from_limbs(bin, (limb_t *)a, Fp_BYTES); } // returns the sign of y. // 1 if y > (p - 1)/2 and 0 otherwise. // y is in montgomery form -static byte Fp_get_sign(const Fp* y) { - // BLST's sgn0_pty_mont_384 requires input to be in Montg form. - // The needed sign bit is on position 1 ! - return (sgn0_pty_mont_384((const limb_t*)y, BLS12_381_P, p0)>>1) & 1; +static byte Fp_get_sign(const Fp *y) { + // BLST's sgn0_pty_mont_384 requires input to be in Montg form. + // The needed sign bit is on position 1 ! 
+ return (sgn0_pty_mont_384((const limb_t *)y, BLS12_381_P, p0) >> 1) & 1; } // ------------------- Fp^2 utilities // sets `a` to limb `l` -static void Fp2_set_limb(Fp2* a, const limb_t l){ - Fp_set_limb(&real(a), l); - Fp_set_zero(&imag(a)); +static void Fp2_set_limb(Fp2 *a, const limb_t l) { + Fp_set_limb(&real(a), l); + Fp_set_zero(&imag(a)); } static void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { - add_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); + add_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); } static void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { - sub_mod_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P); + sub_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); } static void Fp2_neg(Fp2 *res, const Fp2 *a) { - cneg_mod_384(real(res), real(a), 1, BLS12_381_P); - cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); + cneg_mod_384(real(res), real(a), 1, BLS12_381_P); + cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); } // res = a*b in montgomery form static void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { - mul_mont_384x((vec384*)res, (vec384*)a, (vec384*)b, BLS12_381_P, p0); + mul_mont_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P, p0); } // res = a^2 in montgomery form static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { - sqr_mont_384x((vec384*)res, (vec384*)a, BLS12_381_P, p0); + sqr_mont_384x((vec384 *)res, (vec384 *)a, BLS12_381_P, p0); } -// checks if `a` is a quadratic residue in Fp^2. If yes, it computes +// checks if `a` is a quadratic residue in Fp^2. If yes, it computes // the square root in `res`. -// +// // The boolean output is valid whether `a` is in Montgomery form or not, // since montgomery constant `R` is a quadratic residue. // However, the square root is valid only if `a` is in montgomery form. -static bool Fp2_sqrt_montg(Fp2 *res, const Fp2* a) { - return sqrt_fp2((vec384*)res, (vec384*)a); +static bool Fp2_sqrt_montg(Fp2 *res, const Fp2 *a) { + return sqrt_fp2((vec384 *)res, (vec384 *)a); } // returns the sign of y. // sign(y_0) if y_1 = 0, else sign(y_1) // y coordinates must be in montgomery form -static byte Fp2_get_sign(Fp2* y) { - // BLST's sgn0_pty_mont_384x requires input to be in Montg form. - // The needed sign bit is on position 1 ! - return (sgn0_pty_mont_384x((vec384*)y, BLS12_381_P, p0)>>1) & 1; +static byte Fp2_get_sign(Fp2 *y) { + // BLST's sgn0_pty_mont_384x requires input to be in Montg form. + // The needed sign bit is on position 1 ! + return (sgn0_pty_mont_384x((vec384 *)y, BLS12_381_P, p0) >> 1) & 1; } // reads an Fp^2 element in `a`. @@ -422,745 +428,762 @@ static byte Fp2_get_sign(Fp2* y) { // returns: // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp -// - VALID if the scalar is valid -static ERROR Fp2_read_bytes(Fp2* a, const byte *bin, int len) { - if (len != Fp2_BYTES) { - return BAD_ENCODING; - } - ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); - if (ret != VALID) { - return ret; - } - ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); - if ( ret != VALID) { - return ret; - } - return VALID; -} - -// write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. 
-void Fp2_write_bytes(byte *bin, const Fp2* a) { - Fp_write_bytes(bin, &real(a)); - Fp_write_bytes(bin + Fp_BYTES, &imag(a)); +// - VALID if the scalar is valid +static ERROR Fp2_read_bytes(Fp2 *a, const byte *bin, int len) { + if (len != Fp2_BYTES) { + return BAD_ENCODING; + } + ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); + if (ret != VALID) { + return ret; + } + ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); + if (ret != VALID) { + return ret; + } + return VALID; +} + +// write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. +void Fp2_write_bytes(byte *bin, const Fp2 *a) { + Fp_write_bytes(bin, &real(a)); + Fp_write_bytes(bin + Fp_BYTES, &imag(a)); } // ------------------- E1 utilities -void E1_copy(E1* res, const E1* p) { - if ((uptr_t)p == (uptr_t)res) { - return; - } - vec_copy(res, p, sizeof(E1)); +void E1_copy(E1 *res, const E1 *p) { + if ((uptr_t)p == (uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E1)); } // checks p1 == p2 -bool E1_is_equal(const E1* p1, const E1* p2) { - // `POINTonE1_is_equal` includes the infinity case - return POINTonE1_is_equal((const POINTonE1*)p1, (const POINTonE1*)p2); +bool E1_is_equal(const E1 *p1, const E1 *p2) { + // `POINTonE1_is_equal` includes the infinity case + return POINTonE1_is_equal((const POINTonE1 *)p1, (const POINTonE1 *)p2); } // compare p to infinity -bool E1_is_infty(const E1* p) { - // BLST infinity points are defined by Z=0 - return vec_is_zero(p->z, sizeof(p->z)); +bool E1_is_infty(const E1 *p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); } // set p to infinity -void E1_set_infty(E1* p) { - // BLST infinity points are defined by Z=0 - vec_zero(p->z, sizeof(p->z)); +void E1_set_infty(E1 *p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); } // converts an E1 point from Jacobian into affine coordinates (z=1) -void E1_to_affine(E1* res, const E1* p) { - // optimize in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { - E1_copy(res, p); - return; - } - // convert from Jacobian - POINTonE1_from_Jacobian((POINTonE1*)res, (const POINTonE1*)p); +void E1_to_affine(E1 *res, const E1 *p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { + E1_copy(res, p); + return; + } + // convert from Jacobian + POINTonE1_from_Jacobian((POINTonE1 *)res, (const POINTonE1 *)p); } // checks affine point `p` is in E1 -bool E1_affine_on_curve(const E1* p) { - // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! - return POINTonE1_affine_on_curve((POINTonE1_affine*)p) | E1_is_infty(p); +bool E1_affine_on_curve(const E1 *p) { + // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! + return POINTonE1_affine_on_curve((POINTonE1_affine *)p) | E1_is_infty(p); } // checks if input E1 point is on the subgroup G1. // It assumes input `p` is on E1. -bool E1_in_G1(const E1* p){ - // currently uses Scott method - return POINTonE1_in_G1((const POINTonE1*)p); +bool E1_in_G1(const E1 *p) { + // currently uses Scott method + return POINTonE1_in_G1((const POINTonE1 *)p); } -// E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or uncompressed form. -// The resulting point is guaranteed to be on curve E1 (no G1 check is included). -// Expected serialization follows: +// E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or +// uncompressed form. 
The resulting point is guaranteed to be on curve E1 (no G1 +// check is included). Expected serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) // // returns: -// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_ENCODING if the length is invalid or serialization header bits are +// invalid // - BAD_VALUE if Fp coordinates couldn't deserialize // - POINT_NOT_ON_CURVE if deserialized point isn't on E1 -// - VALID if deserialization is valid +// - VALID if deserialization is valid -// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, +// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, // and update logic with G2 subgroup check? -ERROR E1_read_bytes(E1* a, const byte *bin, const int len) { - // check the length - if (len != G1_SER_BYTES) { - return BAD_ENCODING; +ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { + // check the length + if (len != G1_SER_BYTES) { + return BAD_ENCODING; + } + + // check the compression bit + int compressed = bin[0] >> 7; + if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { + return BAD_ENCODING; + } + + // check if the point in infinity + int is_infinity = bin[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (bin[0] & 0x3F) { + return BAD_ENCODING; } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return BAD_ENCODING; - } - - // check if the point in infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (bin[0] & 0x3F) { - return BAD_ENCODING; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { + for (int i = 1; i < G1_SER_BYTES - 1; i++) { + if (bin[i]) { return BAD_ENCODING; - } - - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp_BYTES]; - memcpy(temp, bin, Fp_BYTES); - temp[0] &= 0x1F; // clear the header bits - ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != VALID) { - return ret; - } - Fp_to_montg(&a->x, &a->x); - - // set a.z to 1 - Fp_copy(&a->z, &BLS12_381_pR); - - if (G1_SERIALIZATION == UNCOMPRESSED) { - ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); - if (ret != VALID){ - return ret; - } - Fp_to_montg(&a->y, &a->y); - // check read point is on curve - if (!E1_affine_on_curve(a)) { - return POINT_NOT_ON_CURVE; - } - return VALID; + } } - - // compute the possible square root - Fp_squ_montg(&a->y, &a->x); - Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 - Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(&a->y, &a->y)) { // check whether x^3+b is a quadratic residue - return POINT_NOT_ON_CURVE; + E1_set_infty(a); + return VALID; + } + + // read the sign bit and check for consistency + int y_sign = (bin[0] >> 5) & 1; + if (y_sign && (!compressed)) { + return BAD_ENCODING; + } + + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp_BYTES]; + memcpy(temp, bin, Fp_BYTES); + temp[0] &= 0x1F; // clear the header bits + ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { + return ret; + } + Fp_to_montg(&a->x, &a->x); + + // set a.z to 1 + Fp_copy(&a->z, &BLS12_381_pR); + + if (G1_SERIALIZATION == UNCOMPRESSED) { + ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); + if (ret != VALID) { + return ret; } - - // resulting (x,y) is guaranteed to be 
on curve (y is already in Montg form) - if (Fp_get_sign(&a->y) != y_sign) { - Fp_neg(&a->y, &a->y); // flip y sign if needed + Fp_to_montg(&a->y, &a->y); + // check read point is on curve + if (!E1_affine_on_curve(a)) { + return POINT_NOT_ON_CURVE; } return VALID; -} - -// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or uncompressed form. -// It assumes buffer is of length G1_SER_BYTES -// The serialization follows: + } + + // compute the possible square root + Fp_squ_montg(&a->y, &a->x); + Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 + Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form + if (!Fp_sqrt_montg(&a->y, + &a->y)) { // check whether x^3+b is a quadratic residue + return POINT_NOT_ON_CURVE; + } + + // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) + if (Fp_get_sign(&a->y) != y_sign) { + Fp_neg(&a->y, &a->y); // flip y sign if needed + } + return VALID; +} + +// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or +// uncompressed form. It assumes buffer is of length G1_SER_BYTES The +// serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E1_write_bytes(byte *bin, const E1* a) { - if (E1_is_infty(a)) { - // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); - memset(bin+1, 0, G1_SER_BYTES-1); - return; - } - E1 tmp; - E1_to_affine(&tmp, a); - - Fp_from_montg(&tmp.x, &tmp.x); - Fp_write_bytes(bin, &tmp.x); - - if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(&tmp.y) << 5); - } else { - Fp_from_montg(&tmp.y, &tmp.y); - Fp_write_bytes(bin + Fp_BYTES, &tmp.y); - } - // compression bit - bin[0] |= (G1_SERIALIZATION << 7); +void E1_write_bytes(byte *bin, const E1 *a) { + if (E1_is_infty(a)) { + // set the infinity bit + bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); + memset(bin + 1, 0, G1_SER_BYTES - 1); + return; + } + E1 tmp; + E1_to_affine(&tmp, a); + + Fp_from_montg(&tmp.x, &tmp.x); + Fp_write_bytes(bin, &tmp.x); + + if (G1_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp_get_sign(&tmp.y) << 5); + } else { + Fp_from_montg(&tmp.y, &tmp.y); + Fp_write_bytes(bin + Fp_BYTES, &tmp.y); + } + // compression bit + bin[0] |= (G1_SERIALIZATION << 7); } // generic point addition that must handle doubling and points at infinity -void E1_add(E1* res, const E1* a, const E1* b) { - POINTonE1_dadd((POINTonE1*)res, (POINTonE1*)a, (POINTonE1*)b, NULL); +void E1_add(E1 *res, const E1 *a, const E1 *b) { + POINTonE1_dadd((POINTonE1 *)res, (POINTonE1 *)a, (POINTonE1 *)b, NULL); } // Point negation: res = -a -void E1_neg(E1* res, const E1* a) { - E1_copy(res, a); - POINTonE1_cneg((POINTonE1*)res, 1); +void E1_neg(E1 *res, const E1 *a) { + E1_copy(res, a); + POINTonE1_cneg((POINTonE1 *)res, 1); } // Exponentiation of a generic point `a` in E1, res = expo.a -void E1_mult(E1* res, const E1* p, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE1_mult_glv((POINTonE1*)res, (POINTonE1*)p, tmp); - vec_zero(&tmp, sizeof(tmp)); +void E1_mult(E1 *res, const E1 *p, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE1_mult_glv((POINTonE1 *)res, (POINTonE1 *)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } // computes the sum of the E1 array elements `y[i]` and writes it in `sum`. 
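`E1_write_bytes` above packs three flags into the top bits of the first serialized byte, and `E1_read_bytes` checks the same bits (compression in bit 7, infinity in bit 6, y-sign in bit 5), per the Zcash-style format linked in the comments. A small self-contained sketch of just this header decoding; the struct and function names here (`g1_header`, `decode_g1_header`) are illustrative only:

```c
#include <stdbool.h>
#include <stdio.h>

// The three header flags carried by the first byte of a serialized point.
typedef struct {
  bool compressed; // bit 7: 1 if only x is encoded
  bool infinity;   // bit 6: 1 if the point is the point at infinity
  bool y_sign;     // bit 5: sign of y, only meaningful when compressed
} g1_header;

static g1_header decode_g1_header(unsigned char first_byte) {
  g1_header h;
  h.compressed = (first_byte >> 7) & 1;
  h.infinity = (first_byte >> 6) & 1;
  h.y_sign = (first_byte >> 5) & 1;
  return h;
}

int main(void) {
  // 0xC0: compressed encoding of the point at infinity (remaining bytes zero)
  g1_header h = decode_g1_header(0xC0);
  printf("compressed=%d infinity=%d y_sign=%d\n", h.compressed, h.infinity,
         h.y_sign);
  return 0;
}
```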
-void E1_sum_vector(E1* sum, const E1* y, const int len){ - E1_set_infty(sum); - for (int i=0; i= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, bytes, len); - // multiplies G1 generator by a random scalar - G1_mult_gen(p, &log); +void unsafe_map_bytes_to_G1(E1 *p, const byte *bytes, int len) { + assert(len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); + // multiplies G1 generator by a random scalar + G1_mult_gen(p, &log); } -// maps bytes to a point in E1\G1. +// maps bytes to a point in E1\G1. // `len` must be at least 96 bytes. // this is a testing file only, should not be used in any protocol! -void unsafe_map_bytes_to_G1complement(E1* p, const byte* bytes, int len) { - assert(len >= 96); - Fp u; - map_96_bytes_to_Fp(&u, bytes, 96); - // map to E1's isogenous and then to E1 - map_to_isogenous_E1((POINTonE1 *)p, u); - isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); - // clear G1 order - E1_mult(p, p, (Fr*)&BLS12_381_r); +void unsafe_map_bytes_to_G1complement(E1 *p, const byte *bytes, int len) { + assert(len >= 96); + Fp u; + map_96_bytes_to_Fp(&u, bytes, 96); + // map to E1's isogenous and then to E1 + map_to_isogenous_E1((POINTonE1 *)p, u); + isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); + // clear G1 order + E1_mult(p, p, (Fr *)&BLS12_381_r); } // ------------------- E2 utilities -const E2* BLS12_381_g2 = (const E2*)&BLS12_381_G2; -const E2* BLS12_381_minus_g2 = (const E2*)&BLS12_381_NEG_G2; +const E2 *BLS12_381_g2 = (const E2 *)&BLS12_381_G2; +const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; -// E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or uncompressed form. -// The resulting point is guaranteed to be on curve E2 (no G2 check is included). +// E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or +// uncompressed form. The resulting point is guaranteed to be on curve E2 (no G2 +// check is included). // // returns: -// - BAD_ENCODING if the length is invalid or serialization header bits are invalid +// - BAD_ENCODING if the length is invalid or serialization header bits are +// invalid // - BAD_VALUE if Fp^2 coordinates couldn't deserialize // - POINT_NOT_ON_CURVE if deserialized point isn't on E2 -// - VALID if deserialization is valid +// - VALID if deserialization is valid -// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, +// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update logic with G2 subgroup check? 
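`unsafe_map_bytes_to_G1complement` above ends by multiplying the mapped point by the group order r to clear the G1 component. A one-line justification, assuming the standard fact that $|E_1(\mathbb{F}_p)| = h_1 \cdot r$ with $\gcd(h_1, r) = 1$:

$$\mathrm{ord}([r]P) \mid h_1 \quad\Longrightarrow\quad [r]P \in G_1 \iff [r]P = \mathcal{O},$$

so the result lies in $E_1 \setminus G_1$ whenever it is not the point at infinity.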
-ERROR E2_read_bytes(E2* a, const byte *bin, const int len) { - // check the length - if (len != G2_SER_BYTES) { - return BAD_ENCODING; +ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { + // check the length + if (len != G2_SER_BYTES) { + return BAD_ENCODING; + } + + // check the compression bit + int compressed = bin[0] >> 7; + if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { + return BAD_ENCODING; + } + + // check if the point in infinity + int is_infinity = bin[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (bin[0] & 0x3F) { + return BAD_ENCODING; } - - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return BAD_ENCODING; - } - - // check if the point in infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (bin[0] & 0x3F) { - return BAD_ENCODING; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { + for (int i = 1; i < G2_SER_BYTES - 1; i++) { + if (bin[i]) { return BAD_ENCODING; - } - - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp2_BYTES]; - memcpy(temp, bin, Fp2_BYTES); - temp[0] &= 0x1F; // clear the header bits - ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != VALID) { - return ret; + } } - Fp2* a_x = &(a->x); - Fp_to_montg(&real(a_x), &real(a_x)); - Fp_to_montg(&imag(a_x), &imag(a_x)); - - // set a.z to 1 - Fp2* a_z = &(a->z); - Fp_copy(&real(a_z), &BLS12_381_pR); - Fp_set_zero(&imag(a_z)); - - Fp2* a_y = &(a->y); - if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); - if (ret != VALID){ - return ret; - } - Fp_to_montg(&real(a_y), &real(a_y)); - Fp_to_montg(&imag(a_y), &imag(a_y)); - // check read point is on curve - if (!E2_affine_on_curve(a)) { - return POINT_NOT_ON_CURVE; - } - return VALID; + E2_set_infty(a); + return VALID; + } + + // read the sign bit and check for consistency + int y_sign = (bin[0] >> 5) & 1; + if (y_sign && (!compressed)) { + return BAD_ENCODING; + } + + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp2_BYTES]; + memcpy(temp, bin, Fp2_BYTES); + temp[0] &= 0x1F; // clear the header bits + ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { + return ret; + } + Fp2 *a_x = &(a->x); + Fp_to_montg(&real(a_x), &real(a_x)); + Fp_to_montg(&imag(a_x), &imag(a_x)); + + // set a.z to 1 + Fp2 *a_z = &(a->z); + Fp_copy(&real(a_z), &BLS12_381_pR); + Fp_set_zero(&imag(a_z)); + + Fp2 *a_y = &(a->y); + if (G2_SERIALIZATION == UNCOMPRESSED) { + ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); + if (ret != VALID) { + return ret; } - - // compute the possible square root - Fp2_squ_montg(a_y, a_x); - Fp2_mul_montg(a_y, a_y, a_x); // x^3 - Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form - if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue - return POINT_NOT_ON_CURVE; - - // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) - if (Fp2_get_sign(a_y) != y_sign) { - Fp2_neg(a_y, a_y); // flip y sign if needed + Fp_to_montg(&real(a_y), &real(a_y)); + Fp_to_montg(&imag(a_y), &imag(a_y)); + // check read point is on curve + if (!E2_affine_on_curve(a)) { + return POINT_NOT_ON_CURVE; } return VALID; -} - -// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or uncompressed form. 
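During decompression, `E2_read_bytes` above (like its E1 counterpart) recovers the missing y-coordinate from the curve equation; schematically, with b the curve constant stored in `B_E2` (resp. `B_E1`):

$$y^2 = x^3 + b \quad\Longrightarrow\quad y = \pm\sqrt{x^3 + b},$$

where the square root must exist for the input to describe a point on the curve (otherwise `POINT_NOT_ON_CURVE` is returned), and the sign is fixed to match the header's sign bit via `Fp2_get_sign` / `Fp_get_sign`.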
-// It assumes buffer is of length G2_SER_BYTES -// The serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E2_write_bytes(byte *bin, const E2* a) { - if (E2_is_infty(a)) { - // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); - memset(bin+1, 0, G2_SER_BYTES-1); - return; - } - E2 tmp; - E2_to_affine(&tmp, a); + } - Fp2* t_x = &(tmp.x); - Fp_from_montg(&real(t_x), &real(t_x)); - Fp_from_montg(&imag(t_x), &imag(t_x)); - Fp2_write_bytes(bin, t_x); + // compute the possible square root + Fp2_squ_montg(a_y, a_x); + Fp2_mul_montg(a_y, a_y, a_x); // x^3 + Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form + if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue + return POINT_NOT_ON_CURVE; - Fp2* t_y = &(tmp.y); - if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp2_get_sign(t_y) << 5); - } else { - Fp_from_montg(&real(t_y), &real(t_y)); - Fp_from_montg(&imag(t_y), &imag(t_y)); - Fp2_write_bytes(bin + Fp2_BYTES, t_y); - } + // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) + if (Fp2_get_sign(a_y) != y_sign) { + Fp2_neg(a_y, a_y); // flip y sign if needed + } + return VALID; +} - bin[0] |= (G2_SERIALIZATION << 7); +// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or +// uncompressed form. It assumes buffer is of length G2_SER_BYTES The +// serialization follows: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +void E2_write_bytes(byte *bin, const E2 *a) { + if (E2_is_infty(a)) { + // set the infinity bit + bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); + memset(bin + 1, 0, G2_SER_BYTES - 1); + return; + } + E2 tmp; + E2_to_affine(&tmp, a); + + Fp2 *t_x = &(tmp.x); + Fp_from_montg(&real(t_x), &real(t_x)); + Fp_from_montg(&imag(t_x), &imag(t_x)); + Fp2_write_bytes(bin, t_x); + + Fp2 *t_y = &(tmp.y); + if (G2_SERIALIZATION == COMPRESSED) { + bin[0] |= (Fp2_get_sign(t_y) << 5); + } else { + Fp_from_montg(&real(t_y), &real(t_y)); + Fp_from_montg(&imag(t_y), &imag(t_y)); + Fp2_write_bytes(bin + Fp2_BYTES, t_y); + } + + bin[0] |= (G2_SERIALIZATION << 7); } // set p to infinity -void E2_set_infty(E2* p) { - // BLST infinity points are defined by Z=0 - vec_zero(p->z, sizeof(p->z)); +void E2_set_infty(E2 *p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); } // check if `p` is infinity -bool E2_is_infty(const E2* p) { - // BLST infinity points are defined by Z=0 - return vec_is_zero(p->z, sizeof(p->z)); +bool E2_is_infty(const E2 *p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); } // checks affine point `p` is in E2 -bool E2_affine_on_curve(const E2* p) { - // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! - return POINTonE2_affine_on_curve((POINTonE2_affine*)p) | E2_is_infty(p); +bool E2_affine_on_curve(const E2 *p) { + // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! 
+ return POINTonE2_affine_on_curve((POINTonE2_affine *)p) | E2_is_infty(p); } // checks p1 == p2 -bool E2_is_equal(const E2* p1, const E2* p2) { - // `POINTonE2_is_equal` includes the infinity case - return POINTonE2_is_equal((const POINTonE2*)p1, (const POINTonE2*)p2); +bool E2_is_equal(const E2 *p1, const E2 *p2) { + // `POINTonE2_is_equal` includes the infinity case + return POINTonE2_is_equal((const POINTonE2 *)p1, (const POINTonE2 *)p2); } // res = p -void E2_copy(E2* res, const E2* p) { - if ((uptr_t)p==(uptr_t)res) { - return; - } - vec_copy(res, p, sizeof(E2)); +void E2_copy(E2 *res, const E2 *p) { + if ((uptr_t)p == (uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E2)); } // converts an E2 point from Jacobian into affine coordinates (z=1) -void E2_to_affine(E2* res, const E2* p) { - // optimize in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { - E2_copy(res, p); - return; - } - // convert from Jacobian - POINTonE2_from_Jacobian((POINTonE2*)res, (const POINTonE2*)p); +void E2_to_affine(E2 *res, const E2 *p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { + E2_copy(res, p); + return; + } + // convert from Jacobian + POINTonE2_from_Jacobian((POINTonE2 *)res, (const POINTonE2 *)p); } // generic point addition that must handle doubling and points at infinity -void E2_add(E2* res, const E2* a, const E2* b) { - POINTonE2_dadd((POINTonE2*)res, (POINTonE2*)a, (POINTonE2*)b, NULL); +void E2_add(E2 *res, const E2 *a, const E2 *b) { + POINTonE2_dadd((POINTonE2 *)res, (POINTonE2 *)a, (POINTonE2 *)b, NULL); } // generic point double that must handle point at infinity -void E2_double(E2* res, const E2* a) { - POINTonE2_double((POINTonE2*)res, (POINTonE2*)a); +void E2_double(E2 *res, const E2 *a) { + POINTonE2_double((POINTonE2 *)res, (POINTonE2 *)a); } // Point negation: res = -a -void E2_neg(E2* res, const E2* a) { - E2_copy(res, a); - POINTonE2_cneg((POINTonE2*)res, 1); +void E2_neg(E2 *res, const E2 *a) { + E2_copy(res, a); + POINTonE2_cneg((POINTonE2 *)res, 1); } // Exponentiation of a generic point `a` in E2, res = expo.a -void E2_mult(E2* res, const E2* p, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)p, tmp); - vec_zero(&tmp, sizeof(tmp)); +void E2_mult(E2 *res, const E2 *p, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)p, tmp); + vec_zero(&tmp, sizeof(tmp)); } // Exponentiation of a generic point `a` in E2 by a byte exponent, // using a classic double-and-add algorithm (non constant-time) -void E2_mult_small_expo(E2* res, const E2* p, const byte expo) { - // return early if expo is zero - if (expo == 0) { - E2_set_infty(res); - return; - } - // expo is non zero - - byte mask = 1<<7; - // process the most significant zero bits - while((expo & mask) == 0) { - mask >>= 1; - } - - // process the first `1` bit - E2 tmp; - E2_copy(&tmp, p); - mask >>= 1; - // scan the remaining bits - for ( ; mask != 0 ; mask >>= 1 ) { - E2_double(&tmp, &tmp); - if (expo & mask) { - E2_add(&tmp, &tmp, p); - } +void E2_mult_small_expo(E2 *res, const E2 *p, const byte expo) { + // return early if expo is zero + if (expo == 0) { + E2_set_infty(res); + return; + } + // expo is non zero + + byte mask = 1 << 7; + // process the most significant zero bits + while ((expo & mask) == 0) { + mask >>= 1; + } + + // process the first `1` bit + E2 tmp; + E2_copy(&tmp, p); + 
mask >>= 1; + // scan the remaining bits + for (; mask != 0; mask >>= 1) { + E2_double(&tmp, &tmp); + if (expo & mask) { + E2_add(&tmp, &tmp, p); } - E2_copy(res, &tmp); + } + E2_copy(res, &tmp); } // Exponentiation of generator g2 of G2, res = expo.g2 -void G2_mult_gen(E2* res, const Fr* expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_mult_gls((POINTonE2*)res, (POINTonE2*)BLS12_381_g2, tmp); - vec_zero(&tmp, sizeof(tmp)); +void G2_mult_gen(E2 *res, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)BLS12_381_g2, tmp); + vec_zero(&tmp, sizeof(tmp)); } // checks if input E2 point is on the subgroup G2. // It assumes input `p` is on E2. -bool E2_in_G2(const E2* p){ - // currently uses Scott method - return POINTonE2_in_G2((const POINTonE2*)p); +bool E2_in_G2(const E2 *p) { + // currently uses Scott method + return POINTonE2_in_G2((const POINTonE2 *)p); } // computes the sum of the E2 array elements `y[i]` and writes it in `sum` -void E2_sum_vector(E2* sum, const E2* y, const int len){ - E2_set_infty(sum); - for (int i=0; i= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, bytes, len); - // multiplies G2 generator by a random scalar - G2_mult_gen(p, &log); +void unsafe_map_bytes_to_G2(E2 *p, const byte *bytes, int len) { + assert(len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); + // multiplies G2 generator by a random scalar + G2_mult_gen(p, &log); } // maps `bytes` to a point in E2\G2 and stores it in p. -// `len` should be at least 192. +// `len` should be at least 192. // this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2complement(E2* p, const byte* bytes, int len) { - assert(len >= 192); - Fp2 u; - map_96_bytes_to_Fp(&real(&u), bytes, 96); - map_96_bytes_to_Fp(&imag(&u), bytes+96, 96); - // map to E2's isogenous and then to E2 - map_to_isogenous_E2((POINTonE2 *)p, u); - isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); - // clear G2 order - E2_mult(p, p, (Fr*)&BLS12_381_r); +void unsafe_map_bytes_to_G2complement(E2 *p, const byte *bytes, int len) { + assert(len >= 192); + Fp2 u; + map_96_bytes_to_Fp(&real(&u), bytes, 96); + map_96_bytes_to_Fp(&imag(&u), bytes + 96, 96); + // map to E2's isogenous and then to E2 + map_to_isogenous_E2((POINTonE2 *)p, u); + isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); + // clear G2 order + E2_mult(p, p, (Fr *)&BLS12_381_r); } -// ------------------- Pairing utilities +// ------------------- Pairing utilities bool Fp12_is_one(Fp12 *a) { - return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); + return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); } -void Fp12_set_one(Fp12 *a) { - vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); -} +void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); } -// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) +// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) // by optimizing a common final exponentiation for all pairings. // result is stored in `res`. -// It assumes `p` and `q` are correctly initialized and all +// It assumes `p` and `q` are correctly initialized and all // p[i] and q[i] are respectively on G1 and G2 (it does not // check their memberships). -void Fp12_multi_pairing(Fp12* res, const E1 *p, const E2 *q, const int len) { - // easier access pointer - vec384fp6* res_vec = (vec384fp6*)res; - // N_MAX is defined within BLST. 
It should represent a good tradeoff of the max number - // of miller loops to be batched in one call to `miller_loop_n`. - // miller_loop_n expects an array of `POINTonEx_affine`. - POINTonE1_affine p_aff[N_MAX]; - POINTonE2_affine q_aff[N_MAX]; - int n = 0; // the number of couples (p,q) held in p_aff and q_aff - int init_flag = 0; - - for (int i=0; i 0) { - if (!init_flag) { - miller_loop_n(res_vec, q_aff, p_aff, n); - init_flag = 1; - } else { - vec384fp12 tmp; - miller_loop_n(tmp, q_aff, p_aff, n); - mul_fp12(res_vec, res_vec, tmp); - } - } - - // check if no miller loop was computed + } + // if p_ and q_ aren't empty, + // remaining couples are also batched in `n` miller loops + if (n > 0) { if (!init_flag) { - Fp12_set_one(res); + miller_loop_n(res_vec, q_aff, p_aff, n); + init_flag = 1; + } else { + vec384fp12 tmp; + miller_loop_n(tmp, q_aff, p_aff, n); + mul_fp12(res_vec, res_vec, tmp); } - final_exp(res_vec, res_vec); + } + + // check if no miller loop was computed + if (!init_flag) { + Fp12_set_one(res); + } + final_exp(res_vec, res_vec); } // This is a testing function and is not used in exported functions // It uses an expand message XMD based on SHA2-256. -void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, int len_dst){ - expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); +void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, + int len_dst) { + expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); } - -// DEBUG printing functions +// DEBUG printing functions #if (DEBUG == 1) -void bytes_print_(char* s, byte* data, int len) { - if (strlen(s)) printf("[%s]:\n", s); - for (int i=0; i -#include #include "blst_include.h" +#include +#include typedef uint8_t byte; -typedef _Bool bool; // assuming cgo is using a modern enough compiler +typedef _Bool bool; // assuming cgo is using a modern enough compiler // minimum targeted security level -#define SEC_BITS 128 +#define SEC_BITS 128 typedef enum { - VALID = 0, - INVALID, - BAD_ENCODING, - BAD_VALUE, - POINT_NOT_ON_CURVE, - POINT_NOT_IN_GROUP, - UNDEFINED, + VALID = 0, + INVALID, + BAD_ENCODING, + BAD_VALUE, + POINT_NOT_ON_CURVE, + POINT_NOT_IN_GROUP, + UNDEFINED, } ERROR; -#define BITS_TO_BYTES(x) ((x+7)>>3) -#define BITS_TO_LIMBS(x) ((x+63)>>6) -#define BYTES_TO_LIMBS(x) ((x+7)>>3) -#define LIMBS_TO_BYTES(x) ((x)<<3) -#define MIN(a,b) ((a)>(b)?(b):(a)) +#define BITS_TO_BYTES(x) ((x + 7) >> 3) +#define BITS_TO_LIMBS(x) ((x + 63) >> 6) +#define BYTES_TO_LIMBS(x) ((x + 7) >> 3) +#define LIMBS_TO_BYTES(x) ((x) << 3) +#define MIN(a, b) ((a) > (b) ? 
(b) : (a)) // Fields and Group serialization lengths -#define Fp_BITS 381 -#define Fp2_BYTES (2*Fp_BYTES) -#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) -#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array -#define Fr_BITS 255 -#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) -#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array +#define Fp_BITS 381 +#define Fp2_BYTES (2 * Fp_BYTES) +#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) +#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array +#define Fr_BITS 255 +#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) +#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array -#define G1_BYTES (2*Fp_BYTES) -#define G2_BYTES (2*Fp2_BYTES) +#define G1_BYTES (2 * Fp_BYTES) +#define G2_BYTES (2 * Fp2_BYTES) // Compressed and uncompressed points -#define COMPRESSED 1 -#define UNCOMPRESSED 0 -#define G1_SERIALIZATION (COMPRESSED) -#define G2_SERIALIZATION (COMPRESSED) -#define G1_SER_BYTES (G1_BYTES/(G1_SERIALIZATION+1)) -#define G2_SER_BYTES (G2_BYTES/(G2_SERIALIZATION+1)) +#define COMPRESSED 1 +#define UNCOMPRESSED 0 +#define G1_SERIALIZATION (COMPRESSED) +#define G2_SERIALIZATION (COMPRESSED) +#define G1_SER_BYTES (G1_BYTES / (G1_SERIALIZATION + 1)) +#define G2_SER_BYTES (G2_BYTES / (G2_SERIALIZATION + 1)) // Fr utilities extern const Fr BLS12_381_rR; -bool Fr_is_zero(const Fr* a); -bool Fr_is_equal(const Fr* a, const Fr* b); -void Fr_set_limb(Fr*, const limb_t); -void Fr_copy(Fr*, const Fr*); -void Fr_set_zero(Fr*); -void Fr_add(Fr *res, const Fr *a, const Fr *b); -void Fr_sub(Fr *res, const Fr *a, const Fr *b); -void Fr_neg(Fr *res, const Fr *a); -void Fr_sum_vector(Fr*, const Fr x[], const int); -void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); -void Fr_squ_montg(Fr *res, const Fr *a); -void Fr_to_montg(Fr *res, const Fr *a); -void Fr_from_montg(Fr *res, const Fr *a); -void Fr_exp_montg(Fr *res, const Fr* base, const limb_t* expo, const int expo_len); -void Fr_inv_montg_eucl(Fr *res, const Fr *a); -void Fr_inv_exp_montg(Fr *res, const Fr *a); -ERROR Fr_read_bytes(Fr* a, const byte *bin, int len); -ERROR Fr_star_read_bytes(Fr* a, const byte *bin, int len); -void Fr_write_bytes(byte *bin, const Fr* a); -bool map_bytes_to_Fr(Fr*, const byte*, int); +bool Fr_is_zero(const Fr *a); +bool Fr_is_equal(const Fr *a, const Fr *b); +void Fr_set_limb(Fr *, const limb_t); +void Fr_copy(Fr *, const Fr *); +void Fr_set_zero(Fr *); +void Fr_add(Fr *res, const Fr *a, const Fr *b); +void Fr_sub(Fr *res, const Fr *a, const Fr *b); +void Fr_neg(Fr *res, const Fr *a); +void Fr_sum_vector(Fr *, const Fr x[], const int); +void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); +void Fr_squ_montg(Fr *res, const Fr *a); +void Fr_to_montg(Fr *res, const Fr *a); +void Fr_from_montg(Fr *res, const Fr *a); +void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, + const int expo_len); +void Fr_inv_montg_eucl(Fr *res, const Fr *a); +void Fr_inv_exp_montg(Fr *res, const Fr *a); +ERROR Fr_read_bytes(Fr *a, const byte *bin, int len); +ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len); +void Fr_write_bytes(byte *bin, const Fr *a); +bool map_bytes_to_Fr(Fr *, const byte *, int); // Fp utilities -void Fp_mul_montg(Fp *, const Fp *, const Fp *); -void Fp_squ_montg(Fp *, const Fp *); +void Fp_mul_montg(Fp *, const Fp *, const Fp *); +void Fp_squ_montg(Fp *, const Fp *); // E1 and G1 utilities -void E1_copy(E1*, const E1*); -bool E1_is_equal(const E1*, const E1*); -void E1_set_infty(E1*); -bool 
E1_is_infty(const E1*); -void E1_to_affine(E1*, const E1*); -bool E1_affine_on_curve(const E1*); -bool E1_in_G1(const E1*); -void E1_mult(E1*, const E1*, const Fr*); -void E1_add(E1*, const E1*, const E1*); -void E1_neg(E1*, const E1*); -void E1_sum_vector(E1*, const E1*, const int); -int E1_sum_vector_byte(byte*, const byte*, const int); -void G1_mult_gen(E1*, const Fr*); -ERROR E1_read_bytes(E1*, const byte *, const int); -void E1_write_bytes(byte *, const E1*); -void unsafe_map_bytes_to_G1(E1*, const byte*, int); -void unsafe_map_bytes_to_G1complement(E1*, const byte*, int); - -#define MAP_TO_G1_INPUT_LEN (2*(Fp_BYTES + SEC_BITS/8)) -int map_to_G1(E1*, const byte*, const int); // functions in bls12381_hashtocurve.c +void E1_copy(E1 *, const E1 *); +bool E1_is_equal(const E1 *, const E1 *); +void E1_set_infty(E1 *); +bool E1_is_infty(const E1 *); +void E1_to_affine(E1 *, const E1 *); +bool E1_affine_on_curve(const E1 *); +bool E1_in_G1(const E1 *); +void E1_mult(E1 *, const E1 *, const Fr *); +void E1_add(E1 *, const E1 *, const E1 *); +void E1_neg(E1 *, const E1 *); +void E1_sum_vector(E1 *, const E1 *, const int); +int E1_sum_vector_byte(byte *, const byte *, const int); +void G1_mult_gen(E1 *, const Fr *); +ERROR E1_read_bytes(E1 *, const byte *, const int); +void E1_write_bytes(byte *, const E1 *); +void unsafe_map_bytes_to_G1(E1 *, const byte *, int); +void unsafe_map_bytes_to_G1complement(E1 *, const byte *, int); + +#define MAP_TO_G1_INPUT_LEN (2 * (Fp_BYTES + SEC_BITS / 8)) +int map_to_G1(E1 *, const byte *, + const int); // functions in bls12381_hashtocurve.c // E2 and G2 utilities -void E2_set_infty(E2* p); -bool E2_is_infty(const E2*); -bool E2_affine_on_curve(const E2*); -bool E2_is_equal(const E2*, const E2*); -void E2_copy(E2*, const E2*); -void E2_to_affine(E2*, const E2*); -ERROR E2_read_bytes(E2*, const byte *, const int); -void E2_write_bytes(byte *, const E2*); -void G2_mult_gen(E2*, const Fr*); -void E2_mult(E2*, const E2*, const Fr*); -void E2_mult_small_expo(E2*, const E2*, const byte); -void E2_add(E2* res, const E2* a, const E2* b); -void E2_double(E2* res, const E2* a); -void E2_neg(E2*, const E2*); -void E2_sum_vector(E2*, const E2*, const int); -void E2_subtract_vector(E2* res, const E2* x, const E2* y, const int len); -bool E2_in_G2(const E2*); -void unsafe_map_bytes_to_G2(E2*, const byte*, int); -void unsafe_map_bytes_to_G2complement(E2*, const byte*, int); +void E2_set_infty(E2 *p); +bool E2_is_infty(const E2 *); +bool E2_affine_on_curve(const E2 *); +bool E2_is_equal(const E2 *, const E2 *); +void E2_copy(E2 *, const E2 *); +void E2_to_affine(E2 *, const E2 *); +ERROR E2_read_bytes(E2 *, const byte *, const int); +void E2_write_bytes(byte *, const E2 *); +void G2_mult_gen(E2 *, const Fr *); +void E2_mult(E2 *, const E2 *, const Fr *); +void E2_mult_small_expo(E2 *, const E2 *, const byte); +void E2_add(E2 *res, const E2 *a, const E2 *b); +void E2_double(E2 *res, const E2 *a); +void E2_neg(E2 *, const E2 *); +void E2_sum_vector(E2 *, const E2 *, const int); +void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len); +bool E2_in_G2(const E2 *); +void unsafe_map_bytes_to_G2(E2 *, const byte *, int); +void unsafe_map_bytes_to_G2complement(E2 *, const byte *, int); // pairing and Fp12 -bool Fp12_is_one(Fp12*); -void Fp12_set_one(Fp12*); -void Fp12_multi_pairing(Fp12*, const E1*, const E2*, const int); +bool Fp12_is_one(Fp12 *); +void Fp12_set_one(Fp12 *); +void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); // utility testing 
function void xmd_sha256(byte *, int, byte *, int, byte *, int); @@ -133,13 +135,13 @@ void xmd_sha256(byte *, int, byte *, int, byte *, int); #define DEBUG 1 #if (DEBUG == 1) #include -void bytes_print_(char*, byte*, int); -void Fr_print_(char*, Fr*); -void Fp_print_(char*, const Fp*); -void Fp2_print_(char*, const Fp2*); -void Fp12_print_(char*, const Fp12*); -void E1_print_(char*, const E1*, const int); -void E2_print_(char*, const E2*, const int); +void bytes_print_(char *, byte *, int); +void Fr_print_(char *, Fr *); +void Fp_print_(char *, const Fp *); +void Fp2_print_(char *, const Fp2 *); +void Fp12_print_(char *, const Fp12 *); +void E1_print_(char *, const E1 *, const int); +void E2_print_(char *, const E2 *, const int); #endif /* DEBUG */ #endif /* BLS12_381_UTILS */ \ No newline at end of file diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 39b9e243fd1..942002de747 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -6,461 +6,498 @@ // Computes a BLS signature from a G1 point and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. -static void bls_sign_E1(byte* out, const Fr* sk, const E1* h) { - // s = h^s - E1 s; - E1_mult(&s, h, sk); - E1_write_bytes(out, &s); +static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { + // s = h^s + E1 s; + E1_mult(&s, h, sk); + E1_write_bytes(out, &s); } // Computes a BLS signature from a hash and writes it in `out`. -// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. -// `out` must be allocated properly with `G1_SER_BYTES` bytes. -int bls_sign(byte* out, const Fr* sk, const byte* hash, const int hash_len) { - // hash to G1 - E1 h; - if (map_to_G1(&h, hash, hash_len) != VALID) { - return INVALID; - } - // s = h^sk - bls_sign_E1(out, sk, &h); - return VALID; +// `hash` represents the hashed message with length `hash_len` equal to +// `MAP_TO_G1_INPUT_LEN`. `out` must be allocated properly with `G1_SER_BYTES` +// bytes. +int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { + // hash to G1 + E1 h; + if (map_to_G1(&h, hash, hash_len) != VALID) { + return INVALID; + } + // s = h^sk + bls_sign_E1(out, sk, &h); + return VALID; } -extern const E2* BLS12_381_minus_g2; +extern const E2 *BLS12_381_minus_g2; // Verifies a BLS signature (G1 point) against a public key (G2 point) // and a message hash `h` (G1 point). -// Hash, signature and public key are assumed to be in G1, G1 and G2 respectively. This -// function only checks the pairing equality. -static int bls_verify_E1(const E2* pk, const E1* s, const E1* h) { - E1 elemsG1[2]; - E2 elemsG2[2]; - - // elemsG1[0] = s, elemsG1[1] = h - E1_copy(&elemsG1[0], s); - E1_copy(&elemsG1[1], h); - - // elemsG2[0] = -g2, elemsG2[1] = pk - E2_copy(&elemsG2[0], BLS12_381_minus_g2); - E2_copy(&elemsG2[1], pk); - - // double pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); - if (Fp12_is_one(&e)) { - return VALID; - } - return INVALID; +// Hash, signature and public key are assumed to be in G1, G1 and G2 +// respectively. This function only checks the pairing equality. 
+static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s, elemsG1[1] = h + E1_copy(&elemsG1[0], s); + E1_copy(&elemsG1[1], h); + + // elemsG2[0] = -g2, elemsG2[1] = pk + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + E2_copy(&elemsG2[1], pk); + + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } - // Verifies the validity of an aggregated BLS signature under distinct messages. // -// Each message is mapped to a set of public keys, so that the verification equation is -// optimized to compute one pairing per message. +// Each message is mapped to a set of public keys, so that the verification +// equation is optimized to compute one pairing per message. // - sig is the signature. // - nb_hashes is the number of the messages (hashes) in the map -// - hashes is pointer to all flattened hashes in order where the hash at index i has a byte length len_hashes[i], -// is mapped to pks_per_hash[i] public keys. +// - hashes is pointer to all flattened hashes in order where the hash at index +// i has a byte length len_hashes[i], +// is mapped to pks_per_hash[i] public keys. // - the keys are flattened in pks in the same hashes order. // // membership check of the signature in G1 is verified in this function // membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications using the same pks -int bls_verifyPerDistinctMessage(const byte* sig, - const int nb_hashes, const byte* hashes, const uint32_t* len_hashes, - const uint32_t* pks_per_hash, const E2* pks) { - - int ret = UNDEFINED; // return value - - E1* elemsG1 = (E1*)malloc((nb_hashes + 1) * sizeof(E1)); - if (!elemsG1) goto outG1; - E2* elemsG2 = (E2*)malloc((nb_hashes + 1) * sizeof(E2)); - if (!elemsG2) goto outG2; - - // elemsG1[0] = sig - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { - ret = INVALID; - goto out; - } - - // check signature is in G1 - if (!E1_in_G1(&elemsG1[0])) { - ret = INVALID; - goto out; - } - - // elemsG2[0] = -g2 - E2_copy(&elemsG2[0], BLS12_381_minus_g2); - - // map all hashes to G1 - int offset = 0; - for (int i=1; i < nb_hashes+1; i++) { - // elemsG1[i] = h - // hash to G1 - map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i-1]); - offset += len_hashes[i-1]; - } - - // aggregate public keys mapping to the same hash - offset = 0; - for (int i=1; i < nb_hashes+1; i++) { - // elemsG2[i] = agg_pk[i] - E2_sum_vector(&elemsG2[i], &pks[offset] , pks_per_hash[i-1]); - offset += pks_per_hash[i-1]; - } - - // multi pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1 , elemsG2, nb_hashes+1); - if (Fp12_is_one(&e)) { - ret = VALID; - } else { - ret = INVALID; - } +// the membership check is separated to allow optimizing multiple verifications +// using the same pks +int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, + const byte *hashes, const uint32_t *len_hashes, + const uint32_t *pks_per_hash, const E2 *pks) { + + int ret = UNDEFINED; // return value + + E1 *elemsG1 = (E1 *)malloc((nb_hashes + 1) * sizeof(E1)); + if (!elemsG1) + goto outG1; + E2 *elemsG2 = (E2 *)malloc((nb_hashes + 1) * sizeof(E2)); + if (!elemsG2) + goto outG2; + + // elemsG1[0] = sig + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { + ret = INVALID; + goto out; + } + + // check signature is in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } + + // 
elemsG2[0] = -g2 + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + + // map all hashes to G1 + int offset = 0; + for (int i = 1; i < nb_hashes + 1; i++) { + // elemsG1[i] = h + // hash to G1 + map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i - 1]); + offset += len_hashes[i - 1]; + } + + // aggregate public keys mapping to the same hash + offset = 0; + for (int i = 1; i < nb_hashes + 1; i++) { + // elemsG2[i] = agg_pk[i] + E2_sum_vector(&elemsG2[i], &pks[offset], pks_per_hash[i - 1]); + offset += pks_per_hash[i - 1]; + } + + // multi pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_hashes + 1); + if (Fp12_is_one(&e)) { + ret = VALID; + } else { + ret = INVALID; + } out: - free(elemsG2); + free(elemsG2); outG2: - free(elemsG1); + free(elemsG1); outG1: - return ret; + return ret; } - -// Verifies the validity of an aggregated BLS signature under distinct public keys. +// Verifies the validity of an aggregated BLS signature under distinct public +// keys. // -// Each key is mapped to a set of messages, so that the verification equation is -// optimized to compute one pairing per public key. +// Each key is mapped to a set of messages, so that the verification equation is +// optimized to compute one pairing per public key. // - nb_pks is the number of the public keys in the map. // - pks is pointer to all pks in order where the key at index i -// is mapped to hashes_per_pk[i] hashes. +// is mapped to hashes_per_pk[i] hashes. // - the messages (hashes) are flattened in hashes in the same public key order, // each with a length in len_hashes. // // membership check of the signature in G1 is verified in this function // membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications using the same pks -int bls_verifyPerDistinctKey(const byte* sig, - const int nb_pks, const E2* pks, const uint32_t* hashes_per_pk, - const byte* hashes, const uint32_t* len_hashes){ - - int ret = UNDEFINED; // return value - - E1* elemsG1 = (E1*)malloc((nb_pks + 1) * sizeof(E1)); - if (!elemsG1) goto outG1; - E2* elemsG2 = (E2*)malloc((nb_pks + 1) * sizeof(E2)); - if (!elemsG2) goto outG2; - - // elemsG1[0] = s - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { - ret = INVALID; - goto out; +// the membership check is separated to allow optimizing multiple verifications +// using the same pks +int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, + const uint32_t *hashes_per_pk, const byte *hashes, + const uint32_t *len_hashes) { + + int ret = UNDEFINED; // return value + + E1 *elemsG1 = (E1 *)malloc((nb_pks + 1) * sizeof(E1)); + if (!elemsG1) + goto outG1; + E2 *elemsG2 = (E2 *)malloc((nb_pks + 1) * sizeof(E2)); + if (!elemsG2) + goto outG2; + + // elemsG1[0] = s + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { + ret = INVALID; + goto out; + } + + // check s in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } + + // elemsG2[0] = -g2 + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + + // set the public keys + for (int i = 1; i < nb_pks + 1; i++) { + E2_copy(&elemsG2[i], &pks[i - 1]); + } + + // map all hashes to G1 and aggregate the ones with the same public key + + // tmp_hashes is a temporary array of all hashes under a same key mapped to a + // G1 point. tmp_hashes size is set to the maximum possible size to minimize + // malloc calls. 
+ int tmp_hashes_size = hashes_per_pk[0]; + for (int i = 1; i < nb_pks; i++) { + if (hashes_per_pk[i] > tmp_hashes_size) { + tmp_hashes_size = hashes_per_pk[i]; } - - // check s in G1 - if (!E1_in_G1(&elemsG1[0])){ - ret = INVALID; - goto out; + } + E1 *tmp_hashes = (E1 *)malloc(tmp_hashes_size * sizeof(E1)); + if (!tmp_hashes) { + ret = UNDEFINED; + goto out; + } + + // sum hashes under the same key + int data_offset = 0; + int index_offset = 0; + for (int i = 1; i < nb_pks + 1; i++) { + for (int j = 0; j < hashes_per_pk[i - 1]; j++) { + // map the hash to G1 + map_to_G1(&tmp_hashes[j], &hashes[data_offset], len_hashes[index_offset]); + data_offset += len_hashes[index_offset]; + index_offset++; } + // aggregate all the points of the array + E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i - 1]); + } + free(tmp_hashes); - // elemsG2[0] = -g2 - E2_copy(&elemsG2[0], BLS12_381_minus_g2); + // multi pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks + 1); - // set the public keys - for (int i=1; i < nb_pks+1; i++) { - E2_copy(&elemsG2[i], &pks[i-1]); - } - - // map all hashes to G1 and aggregate the ones with the same public key - - // tmp_hashes is a temporary array of all hashes under a same key mapped to a G1 point. - // tmp_hashes size is set to the maximum possible size to minimize malloc calls. - int tmp_hashes_size = hashes_per_pk[0]; - for (int i=1; i tmp_hashes_size) { - tmp_hashes_size = hashes_per_pk[i]; - } - } - E1* tmp_hashes = (E1*)malloc(tmp_hashes_size * sizeof(E1)); - if (!tmp_hashes) { - ret = UNDEFINED; - goto out; - } - - // sum hashes under the same key - int data_offset = 0; - int index_offset = 0; - for (int i=1; i < nb_pks+1; i++) { - for (int j=0; j < hashes_per_pk[i-1]; j++) { - // map the hash to G1 - map_to_G1(&tmp_hashes[j], &hashes[data_offset], len_hashes[index_offset]); - data_offset += len_hashes[index_offset]; - index_offset++; - } - // aggregate all the points of the array - E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i-1]); - } - free(tmp_hashes); - - // multi pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks+1); - - if (Fp12_is_one(&e)) { - ret = VALID; - } else { - ret = INVALID; - } + if (Fp12_is_one(&e)) { + ret = VALID; + } else { + ret = INVALID; + } out: - free(elemsG2); + free(elemsG2); outG2: - free(elemsG1); + free(elemsG1); outG1: - return ret; + return ret; } // Verifies a BLS signature in a byte buffer. // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. -// the membership check in G2 is separated to optimize multiple verifications using the same key. -// `hash` represents the hashed message with length `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. -int bls_verify(const E2* pk, const byte* sig, const byte* hash, const int hash_len) { - E1 s, h; - // deserialize the signature into a curve point - if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { - return INVALID; - } +// the membership check in G2 is separated to optimize multiple verifications +// using the same key. `hash` represents the hashed message with length +// `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. 
+int bls_verify(const E2 *pk, const byte *sig, const byte *hash, + const int hash_len) { + E1 s, h; + // deserialize the signature into a curve point + if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { + return INVALID; + } - // check s is in G1 - if (!E1_in_G1(&s)) { - return INVALID; - } + // check s is in G1 + if (!E1_in_G1(&s)) { + return INVALID; + } - if (map_to_G1(&h, hash, hash_len) != VALID) { - return INVALID; - } - - return bls_verify_E1(pk, &s, &h); -} + if (map_to_G1(&h, hash, hash_len) != VALID) { + return INVALID; + } + return bls_verify_E1(pk, &s, &h); +} // binary tree structure to be used by bls_batch verify. -// Each node contains a signature and a public key, the signature (resp. the public key) -// being the aggregated signature of the two children's signature (resp. public keys). -// The leaves contain the initial signatures and public keys. -typedef struct st_node { - E1* sig; - E2* pk; - struct st_node* left; - struct st_node* right; +// Each node contains a signature and a public key, the signature (resp. the +// public key) being the aggregated signature of the two children's signature +// (resp. public keys). The leaves contain the initial signatures and public +// keys. +typedef struct st_node { + E1 *sig; + E2 *pk; + struct st_node *left; + struct st_node *right; } node; -static node* new_node(const E2* pk, const E1* sig){ - node* t = (node*) malloc(sizeof(node)); - if (t) { - t->pk = (E2*)pk; - t->sig = (E1*)sig; - t->right = t->left = NULL; - } - return t; +static node *new_node(const E2 *pk, const E1 *sig) { + node *t = (node *)malloc(sizeof(node)); + if (t) { + t->pk = (E2 *)pk; + t->sig = (E1 *)sig; + t->right = t->left = NULL; + } + return t; } -static void free_tree(node* root) { - if (!root) return; - - // only free pks and sigs of non-leafs, data of leafs are allocated - // as an entire array in `bls_batch_verify`. - if (root->left) { // no need to check the right child for the leaf check because - // the recursive build starts with the left side first - // pointer free - free(root->sig); - free(root->pk); - // free the children nodes - free_tree(root->left); - free_tree(root->right); - } - free(root); +static void free_tree(node *root) { + if (!root) + return; + + // only free pks and sigs of non-leafs, data of leafs are allocated + // as an entire array in `bls_batch_verify`. + if (root->left) { // no need to check the right child for the leaf check + // because + // the recursive build starts with the left side first + // pointer free + free(root->sig); + free(root->pk); + // free the children nodes + free_tree(root->left); + free_tree(root->right); + } + free(root); } -// builds a binary tree of aggregation of signatures and public keys recursively. 
-static node* build_tree(const int len, const E2* pks, const E1* sigs) { - // check if a leaf is reached - if (len == 1) { - return new_node(&pks[0], &sigs[0]); // use the first element of the arrays - } - - // a leaf is not reached yet, - int right_len = len/2; - int left_len = len - right_len; - - // create a new node with new points - E2* new_pk = (E2*)malloc(sizeof(E2)); - if (!new_pk) {goto error;} - E1* new_sig = (E1*)malloc(sizeof(E1)); - if (!new_sig) {goto error_sig;} - - node* t = new_node(new_pk, new_sig); - if (!t) goto error_node; - - // build the tree in a top-down way - t->left = build_tree(left_len, &pks[0], &sigs[0]); - if (!t->left) { free_tree(t); goto error; } - - t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); - if (!t->right) { free_tree(t); goto error; } - // sum the children - E1_add(t->sig, t->left->sig, t->right->sig); - E2_add(t->pk, t->left->pk, t->right->pk); - return t; +// builds a binary tree of aggregation of signatures and public keys +// recursively. +static node *build_tree(const int len, const E2 *pks, const E1 *sigs) { + // check if a leaf is reached + if (len == 1) { + return new_node(&pks[0], &sigs[0]); // use the first element of the arrays + } + + // a leaf is not reached yet, + int right_len = len / 2; + int left_len = len - right_len; + + // create a new node with new points + E2 *new_pk = (E2 *)malloc(sizeof(E2)); + if (!new_pk) { + goto error; + } + E1 *new_sig = (E1 *)malloc(sizeof(E1)); + if (!new_sig) { + goto error_sig; + } + + node *t = new_node(new_pk, new_sig); + if (!t) + goto error_node; + + // build the tree in a top-down way + t->left = build_tree(left_len, &pks[0], &sigs[0]); + if (!t->left) { + free_tree(t); + goto error; + } + + t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); + if (!t->right) { + free_tree(t); + goto error; + } + // sum the children + E1_add(t->sig, t->left->sig, t->right->sig); + E2_add(t->pk, t->left->pk, t->right->pk); + return t; error_node: - free(new_sig); + free(new_sig); error_sig: - free(new_pk); + free(new_pk); error: - return NULL; + return NULL; } -// verify the binary tree and fill the results using recursive batch verifications. -static void bls_batch_verify_tree(const node* root, const int len, byte* results, const E1* h) { - // verify the aggregated signature against the aggregated public key. - int res = bls_verify_E1(root->pk, root->sig, h); - - // if the result is valid, all the subtree signatures are valid. - if (res == VALID) { - for (int i=0; i < len; i++) { - if (results[i] == UNDEFINED) results[i] = VALID; // do not overwrite invalid results - } - return; +// verify the binary tree and fill the results using recursive batch +// verifications. +static void bls_batch_verify_tree(const node *root, const int len, + byte *results, const E1 *h) { + // verify the aggregated signature against the aggregated public key. + int res = bls_verify_E1(root->pk, root->sig, h); + + // if the result is valid, all the subtree signatures are valid. + if (res == VALID) { + for (int i = 0; i < len; i++) { + if (results[i] == UNDEFINED) + results[i] = VALID; // do not overwrite invalid results } - - // check if root is a leaf - if (root->left == NULL) { // no need to check the right side - *results = INVALID; - return; - } - - // otherwise, at least one of the subtree signatures is invalid. - // use the binary tree structure to find the invalid signatures. 
- int right_len = len/2; - int left_len = len - right_len; - bls_batch_verify_tree(root->left, left_len, &results[0], h); - bls_batch_verify_tree(root->right, right_len, &results[left_len], h); + return; + } + + // check if root is a leaf + if (root->left == NULL) { // no need to check the right side + *results = INVALID; + return; + } + + // otherwise, at least one of the subtree signatures is invalid. + // use the binary tree structure to find the invalid signatures. + int right_len = len / 2; + int left_len = len - right_len; + bls_batch_verify_tree(root->left, left_len, &results[0], h); + bls_batch_verify_tree(root->right, right_len, &results[left_len], h); } -// Batch verifies the validity of a multiple BLS signatures of the -// same message under multiple public keys. Each signature at index `i` is verified -// against the public key at index `i`. -// `seed` is used as the entropy source for randoms required by the computation. The function -// assumes the source size is at least (16*sigs_len) of random bytes of entropy at least 128 bits. +// Batch verifies the validity of a multiple BLS signatures of the +// same message under multiple public keys. Each signature at index `i` is +// verified against the public key at index `i`. `seed` is used as the entropy +// source for randoms required by the computation. The function assumes the +// source size is at least (16*sigs_len) of random bytes of entropy at least 128 +// bits. // // - membership checks of all signatures is verified upfront. -// - use random coefficients for signatures and public keys at the same index to prevent +// - use random coefficients for signatures and public keys at the same index to +// prevent // indices mixup. -// - optimize the verification by verifying an aggregated signature against an aggregated -// public key, and use a recursive verification to find invalid signatures. -void bls_batch_verify(const int sigs_len, byte* results, const E2* pks_input, - const byte* sigs_bytes, const byte* data, const int data_len, const byte* seed) { - - // initialize results to undefined - memset(results, UNDEFINED, sigs_len); - - // build the arrays of G1 and G2 elements to verify - E2* pks = (E2*) malloc(sigs_len * sizeof(E2)); - if (!pks) return; - E1* sigs = (E1*) malloc(sigs_len * sizeof(E1)); - if (!sigs) goto out_sigs; - - for (int i=0; i < sigs_len; i++) { - // convert the signature points: - // - invalid points are stored as infinity points with an invalid result, so that - // the tree aggregations remain valid. - // - valid points are multiplied by a random scalar (same for public keys at same index) - // to make sure a signature at index (i) is verified against the public key at the same index. 
- int read_ret = E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES*i], G1_SER_BYTES); - if (read_ret != VALID || !E1_in_G1(&sigs[i])) { - // set signature and key to infinity (no effect on the aggregation tree) - // and set result to invalid (result won't be overwritten) - E2_set_infty(&pks[i]); - E1_set_infty(&sigs[i]); - results[i] = INVALID; - } else { - // choose a random non-zero coefficient of at least 128 bits - Fr r, one; - // r = random, i-th seed is used for i-th signature - Fr_set_zero(&r); - const int seed_len = SEC_BITS/8; - limbs_from_be_bytes((limb_t*)&r, seed + (seed_len*i), seed_len); // faster shortcut than Fr_map_bytes - // r = random + 1 - Fr_set_limb(&one, 1); - Fr_add(&r, &r, &one); - // multiply public key and signature by the same random exponent r - E2_mult(&pks[i], &pks_input[i], &r); - E1_mult(&sigs[i], &sigs[i], &r); - } - } - // build a binary tree of aggreagtions - node* root = build_tree(sigs_len, &pks[0], &sigs[0]); - if (!root) goto out; - - E1 h; - if (map_to_G1(&h, data, data_len) != VALID) { - goto out; +// - optimize the verification by verifying an aggregated signature against an +// aggregated +// public key, and use a recursive verification to find invalid signatures. +void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, + const byte *sigs_bytes, const byte *data, + const int data_len, const byte *seed) { + + // initialize results to undefined + memset(results, UNDEFINED, sigs_len); + + // build the arrays of G1 and G2 elements to verify + E2 *pks = (E2 *)malloc(sigs_len * sizeof(E2)); + if (!pks) + return; + E1 *sigs = (E1 *)malloc(sigs_len * sizeof(E1)); + if (!sigs) + goto out_sigs; + + for (int i = 0; i < sigs_len; i++) { + // convert the signature points: + // - invalid points are stored as infinity points with an invalid result, so + // that the tree aggregations remain valid. + // - valid points are multiplied by a random scalar (same for public keys at + // same index) to make sure a signature at index (i) is verified against the + // public key at the same index. 
+ int read_ret = + E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES * i], G1_SER_BYTES); + if (read_ret != VALID || !E1_in_G1(&sigs[i])) { + // set signature and key to infinity (no effect on the aggregation tree) + // and set result to invalid (result won't be overwritten) + E2_set_infty(&pks[i]); + E1_set_infty(&sigs[i]); + results[i] = INVALID; + } else { + // choose a random non-zero coefficient of at least 128 bits + Fr r, one; + // r = random, i-th seed is used for i-th signature + Fr_set_zero(&r); + const int seed_len = SEC_BITS / 8; + limbs_from_be_bytes((limb_t *)&r, seed + (seed_len * i), + seed_len); // faster shortcut than Fr_map_bytes + // r = random + 1 + Fr_set_limb(&one, 1); + Fr_add(&r, &r, &one); + // multiply public key and signature by the same random exponent r + E2_mult(&pks[i], &pks_input[i], &r); + E1_mult(&sigs[i], &sigs[i], &r); } - - // verify the binary tree and fill the results using batch verification - bls_batch_verify_tree(root, sigs_len, &results[0], &h); - // free the allocated tree - free_tree(root); + } + // build a binary tree of aggreagtions + node *root = build_tree(sigs_len, &pks[0], &sigs[0]); + if (!root) + goto out; + + E1 h; + if (map_to_G1(&h, data, data_len) != VALID) { + goto out; + } + + // verify the binary tree and fill the results using batch verification + bls_batch_verify_tree(root, sigs_len, &results[0], &h); + // free the allocated tree + free_tree(root); out: - free(sigs); + free(sigs); out_sigs: - free(pks); + free(pks); } // Verifies the validity of 2 SPoCK proofs and 2 public keys. // Membership check in G1 of both proofs is verified in this function. // Membership check in G2 of both keys is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications -// using the same public keys. -int bls_spock_verify(const E2* pk1, const byte* sig1, const E2* pk2, const byte* sig2) { - E1 elemsG1[2]; - E2 elemsG2[2]; - - // elemsG1[0] = s1 - if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) { - return INVALID; - }; - // check s1 is in G1 - if (!E1_in_G1(&elemsG1[0])) { - return INVALID; - } +// the membership check in G2 is separated to allow optimizing multiple +// verifications using the same public keys. 
+int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2, + const byte *sig2) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s1 + if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s1 is in G1 + if (!E1_in_G1(&elemsG1[0])) { + return INVALID; + } - // elemsG1[1] = s2 - if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) { - return INVALID; - }; - // check s2 is in G1 - if (!E1_in_G1(&elemsG1[1])) { - return INVALID; - } + // elemsG1[1] = s2 + if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s2 is in G1 + if (!E1_in_G1(&elemsG1[1])) { + return INVALID; + } - // elemsG2[1] = pk1 - E2_copy(&elemsG2[1], pk1); + // elemsG2[1] = pk1 + E2_copy(&elemsG2[1], pk1); - // elemsG2[0] = -pk2 - E2_neg(&elemsG2[0], pk2); + // elemsG2[0] = -pk2 + E2_neg(&elemsG2[0], pk2); - // double pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1 , elemsG2, 2); + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); - if (Fp12_is_one(&e)) { - return VALID; - } - return INVALID; + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } - diff --git a/crypto/bls_include.h b/crypto/bls_include.h index c5dba4d45de..af380735237 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -6,17 +6,17 @@ #include "bls12381_utils.h" // BLS signature core (functions in bls_core.c) -int bls_sign(byte*, const Fr*, const byte*, const int); -int bls_verify(const E2*, const byte*, const byte*, const int); -int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, - const uint32_t*, const E2*); -int bls_verifyPerDistinctKey(const byte*, - const int, const E2*, const uint32_t*, - const byte*, const uint32_t*); -void bls_batch_verify(const int, byte*, const E2*, - const byte*, const byte*, const int, const byte*); +int bls_sign(byte *, const Fr *, const byte *, const int); +int bls_verify(const E2 *, const byte *, const byte *, const int); +int bls_verifyPerDistinctMessage(const byte *, const int, const byte *, + const uint32_t *, const uint32_t *, + const E2 *); +int bls_verifyPerDistinctKey(const byte *, const int, const E2 *, + const uint32_t *, const byte *, const uint32_t *); +void bls_batch_verify(const int, byte *, const E2 *, const byte *, const byte *, + const int, const byte *); // BLS based SPoCK -int bls_spock_verify(const E2*, const byte*, const E2*, const byte*); +int bls_spock_verify(const E2 *, const byte *, const E2 *, const byte *); #endif diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index e951cc9c33f..dc7e1354907 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -1,109 +1,117 @@ #include "bls_thresholdsign_include.h" // the highest index of a threshold participant -#define MAX_IND 255 -#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND)) +#define MAX_IND 255 +#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND)) -// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range [indices(0)..indices(t)] -// and stores it in `res`, where t is the degree of the polynomial P. -// `len` is equal to `t+1` where `t` is the polynomial degree. -static void Fr_lagrange_coeff_at_zero(Fr* res, const int i, const byte indices[], const int len){ +// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range +// [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the +// polynomial P. 
`len` is equal to `t+1` where `t` is the polynomial degree. +static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, + const byte indices[], const int len) { - // coefficient is computed as N * D^(-1) - Fr numerator; // eventually would represent N*R^k - Fr denominator; // eventually would represent D*R^k + // coefficient is computed as N * D^(-1) + Fr numerator; // eventually would represent N*R^k + Fr denominator; // eventually would represent D*R^k - // Initialize N and D to Montgomery constant R - Fr_copy(&numerator, &BLS12_381_rR); - Fr_copy(&denominator, &BLS12_381_rR); + // Initialize N and D to Montgomery constant R + Fr_copy(&numerator, &BLS12_381_rR); + Fr_copy(&denominator, &BLS12_381_rR); - // sign of D: 0 for positive and 1 for negative - int sign = 0; + // sign of D: 0 for positive and 1 for negative + int sign = 0; - // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately 64/MAX_IND_BITS) - // this means we can multiply up to (k) indices in a limb (64 bits) without overflowing. - #define MAX_IND_LOOPS (64/MAX_IND_BITS) - const int loops = MAX_IND_LOOPS; - int k,j = 0; - Fr tmp; - while (j= 0; i--) { - Fr_mul_montg(image, image, &xR); - Fr_add(image, image, &a[i]); // image is in normal form - } - // compute y = P(x).g2 - if (y) { - G2_mult_gen(y, image); - } + for (int i = degree; i >= 0; i--) { + Fr_mul_montg(image, image, &xR); + Fr_add(image, image, &a[i]); // image is in normal form + } + // compute y = P(x).g2 + if (y) { + G2_mult_gen(y, image); + } } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 // and stores the point in y -static void E2_polynomial_image(E2* y, const E2* A, const int degree, const byte x){ - E2_set_infty(y); - for (int i = degree; i >= 0 ; i--) { - E2_mult_small_expo(y, y, x); - E2_add(y, y, &A[i]); - } +static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, + const byte x) { + E2_set_infty(y); + for (int i = degree; i >= 0; i--) { + E2_mult_small_expo(y, y, x); + E2_add(y, y, &A[i]); + } } - // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) // where Q(x) = A_0 + A_1*x + ... 
+ A_n*x^n in G2[X] -void E2_polynomial_images(E2* y, const int len_y, const E2* A, const int degree) { - for (byte i=0; i Date: Fri, 18 Aug 2023 14:14:43 -0600 Subject: [PATCH 141/200] remove clanf-format config file --- crypto/.clang-format | 192 ------------------------------------------- 1 file changed, 192 deletions(-) delete mode 100644 crypto/.clang-format diff --git a/crypto/.clang-format b/crypto/.clang-format deleted file mode 100644 index 48b2c678323..00000000000 --- a/crypto/.clang-format +++ /dev/null @@ -1,192 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: LLVM -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveMacros: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortEnumsOnASingleLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: Never - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: true -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -QualifierAlignment: Leave -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DeriveLineEnding: true -DerivePointerAlignment: false -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -PackConstructorInitializers: BinPack -BasedOnStyle: '' -ConstructorInitializerAllOnOneLineOrOnePerLine: false -AllowAllConstructorInitializersOnNextLine: true -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 1 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseLabels: false -IndentCaseBlocks: false -IndentGotoLabels: true -IndentPPDirectives: None -IndentExternBlock: AfterExternBlock -IndentRequires: false -IndentWidth: 2 
-IndentWrappedFunctionNames: false -InsertTrailingCommas: None -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true -LambdaBodyIndentation: Signature -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 2 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 -PenaltyIndentedWhitespace: 0 -PointerAlignment: Right -PPIndentWidth: -1 -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: CaseSensitive -SortJavaStaticImport: Before -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - BeforeNonEmptyParentheses: false -SpaceAroundPointerQualifiers: Default -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: Never -SpacesInConditionalStatement: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParentheses: false -SpacesInSquareBrackets: false -SpaceBeforeSquareBrackets: false -BitFieldColonSpacing: Both -Standard: Latest -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseCRLF: false -UseTab: Never -WhitespaceSensitiveMacros: - - STRINGIZE - - PP_STRINGIZE - - BOOST_PP_STRINGIZE - - NS_SWIFT_NAME - - CF_SWIFT_NAME -... 
- From 647c0c2222263680aede664b71ee8ca0afd0b2f0 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 18 Aug 2023 14:20:29 -0600 Subject: [PATCH 142/200] add crypto code formatting check to CI --- Makefile | 2 +- crypto/Makefile | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c927ff4403a..a84d3f0d276 100644 --- a/Makefile +++ b/Makefile @@ -205,7 +205,7 @@ generate-mocks: install-mock-generators tidy: go mod tidy -v cd integration; go mod tidy -v - cd crypto; go mod tidy -v + $(MAKE) -C crypto tidy cd cmd/testclient; go mod tidy -v cd insecure; go mod tidy -v git diff --exit-code diff --git a/crypto/Makefile b/crypto/Makefile index 28e7a5f6f2f..ffde0fa6b57 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -29,7 +29,7 @@ else endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) -# format +# format C code .PHONY: c-format c-format: clang-format -style=llvm -dump-config > .clang-format @@ -38,6 +38,13 @@ c-format: rm -f .clang-format git diff --exit-code +# tidy Go and C code +.PHONY: tidy +tidy: c-format + go mod tidy -v + git diff --exit-code + + # test all packages .PHONY: test test: From 63cedc8ffd5e52df559a0160613447a0481b4ca1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 21 Aug 2023 10:15:50 -0600 Subject: [PATCH 143/200] move c formatting to linter target --- .github/workflows/ci.yml | 5 +++++ Makefile | 2 +- crypto/Makefile | 15 ++++++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57b0da2ace2..9bc0e30291e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,10 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Install C formatter + run: sudo apt-get install -y clang-format + - name: Run C formatter for ./crypto + run: make -C crypto c-format - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -50,6 +54,7 @@ jobs: working-directory: ${{ matrix.dir }} # https://github.com/golangci/golangci-lint-action/issues/244 skip-cache: true + tidy: name: Tidy diff --git a/Makefile b/Makefile index a84d3f0d276..9cb7ac5fac9 100644 --- a/Makefile +++ b/Makefile @@ -205,7 +205,7 @@ generate-mocks: install-mock-generators tidy: go mod tidy -v cd integration; go mod tidy -v - $(MAKE) -C crypto tidy + cd crypo; go mod tidy -v cd cmd/testclient; go mod tidy -v cd insecure; go mod tidy -v git diff --exit-code diff --git a/crypto/Makefile b/crypto/Makefile index ffde0fa6b57..be839a18118 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -38,12 +38,21 @@ c-format: rm -f .clang-format git diff --exit-code -# tidy Go and C code -.PHONY: tidy -tidy: c-format +# Go tidy +.PHONY: go-tidy +go-tidy: go mod tidy -v git diff --exit-code +# Go lint +.PHONY: go-lint +go-lint: +lint: go-tidy + # revive -config revive.toml + golangci-lint run -v ./... 
+ + + # test all packages .PHONY: test From a8666e4c97cede542b2e431e60a90181ff2c4882 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 21 Aug 2023 10:57:50 -0600 Subject: [PATCH 144/200] fix linter error --- crypto/bls_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/crypto/bls_test.go b/crypto/bls_test.go index 0ead9fd3100..aa1e171b216 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -660,7 +660,6 @@ func TestBLSBatchVerify(t *testing.T) { // number of signatures to aggregate sigsNum := rand.Intn(100) + 2 sigs := make([]Signature, 0, sigsNum) - sks := make([]PrivateKey, 0, sigsNum) pks := make([]PublicKey, 0, sigsNum) expectedValid := make([]bool, 0, sigsNum) @@ -670,7 +669,6 @@ func TestBLSBatchVerify(t *testing.T) { s, err := sk.Sign(input, kmac) require.NoError(t, err) sigs = append(sigs, s) - sks = append(sks, sk) pks = append(pks, sk.PublicKey()) expectedValid = append(expectedValid, true) } From 30e5a7b23aeec7c282550cb700a097e44db1a0ba Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 21 Aug 2023 20:55:45 -0600 Subject: [PATCH 145/200] delete unused fermat inversion --- crypto/bls12381_utils.c | 56 ---------------------------------------- crypto/bls12381_utils.go | 4 +-- crypto/bls12381_utils.h | 3 --- 3 files changed, 2 insertions(+), 61 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 665f3853236..7e1afbf7fbf 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -95,62 +95,6 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { redc_mont_256((limb_t *)res, temp, BLS12_381_r, r0); } -// result is in Montgomery form if base is in montgomery form -// if base = b*R, res = b^expo * R -// In general, res = base^expo * R^(-expo+1) -// `expo` is encoded as a little-endian limb_t table of length `expo_len`. 
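/* For intuition about the removed helpers below: Fr_exp_montg is a left-to-right
 * square-and-multiply, and Fr_inv_exp_montg used it for Fermat inversion
 * (a^(r-2) = a^(-1) mod r for prime r). A self-contained sketch of the same
 * structure on plain word-size arithmetic, without the Montgomery bookkeeping
 * described above; the small prime and names are illustrative only. */
#include <stdint.h>
#include <stdio.h>

/* base^expo mod m, scanning exponent bits from most to least significant.
 * Assumes base, m < 2^32 so the 64-bit products cannot overflow. */
static uint64_t modexp(uint64_t base, uint64_t expo, uint64_t m) {
  uint64_t acc = 1 % m;
  for (int i = 63; i >= 0; i--) {
    acc = (acc * acc) % m;      /* square */
    if ((expo >> i) & 1) {
      acc = (acc * base) % m;   /* multiply when the exponent bit is set */
    }
  }
  return acc;
}

int main(void) {
  uint64_t m = 65537;                      /* small prime standing in for r */
  uint64_t inv = modexp(12345, m - 2, m);  /* Fermat inversion of 12345 mod m */
  printf("%llu\n", (unsigned long long)((inv * 12345) % m)); /* prints 1 */
  return 0;
}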
-// TODO: could be deleted -void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, - const int expo_len) { - // mask of the most significant bit - const limb_t msb_mask = (limb_t)1 << ((sizeof(limb_t) << 3) - 1); - limb_t mask = msb_mask; - int index = 0; - - expo += expo_len; - // process most significant zero limbs - while ((index < expo_len) && (*(--expo) == 0)) { - index++; - } - // if expo is zero - if (index == expo_len) { - Fr_copy(res, base); - return; - } - // expo is non zero - // process the most significant zero bits - while ((*expo & mask) == 0) { - mask >>= 1; - } - Fr tmp; - // process the first `1` bit - Fr_copy(&tmp, base); - mask >>= 1; - // Scan all limbs of the exponent - for (; index < expo_len; expo--) { - // Scan all bits - for (; mask != 0; mask >>= 1) { - // square - Fr_squ_montg(&tmp, &tmp); - // multiply - if (*expo & mask) { - Fr_mul_montg(&tmp, &tmp, base); - } - } - mask = msb_mask; - index++; - } - Fr_copy(res, &tmp); -} - -// TODO: could be deleted -void Fr_inv_exp_montg(Fr *res, const Fr *a) { - Fr r_2; - Fr_copy(&r_2, (Fr *)BLS12_381_r); - r_2.limbs[0] -= 2; - Fr_exp_montg(res, a, (limb_t *)&r_2, 4); -} - // computes the sum of the array elements and writes the sum in jointx void Fr_sum_vector(Fr *jointx, const Fr x[], const int len) { Fr_set_zero(jointx); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 40580ca7239..a972ca46b64 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -16,8 +16,8 @@ package crypto // static void handler(int signum) // { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=\"-O -D__BLST_PORTABLE__\"\n"; // ssize_t n = write(2, &text, strlen(text)); -// _exit(128+SIGILL); -// (void)n; +// _exit(128+SIGILL); +// (void)n; // } // __attribute__((constructor)) static void flow_crypto_cgo_init() // { Fp temp = { 0 }; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b2ea2654228..1936c151497 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -66,10 +66,7 @@ void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); void Fr_squ_montg(Fr *res, const Fr *a); void Fr_to_montg(Fr *res, const Fr *a); void Fr_from_montg(Fr *res, const Fr *a); -void Fr_exp_montg(Fr *res, const Fr *base, const limb_t *expo, - const int expo_len); void Fr_inv_montg_eucl(Fr *res, const Fr *a); -void Fr_inv_exp_montg(Fr *res, const Fr *a); ERROR Fr_read_bytes(Fr *a, const byte *bin, int len); ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len); void Fr_write_bytes(byte *bin, const Fr *a); From 8556f69ff882b80a28d8f3f57440416b8aa6f1a1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 22 Aug 2023 11:02:57 -0600 Subject: [PATCH 146/200] update README with BLST update steps --- crypto/bls12381_utils.c | 2 +- crypto/blst_src/README.md | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 7e1afbf7fbf..b583462886b 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -22,7 +22,7 @@ const Fr BLS12_381_rR = {{ // returns true if a == 0 and false otherwise bool Fr_is_zero(const Fr *a) { - return bytes_are_zero((const byte *)a, sizeof(Fr)); + return vec_is_zero(a, sizeof(Fr)); } // returns true if a == b and false otherwise diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 877c9db7ee5..1234169dbef 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ 
-9,9 +9,21 @@ While BLST exports multiple functions and tools, the implementation in Flow cryp The folder contains: - BLST LICENSE file -- all /src/*.c and /src/*.h files (C source files) -- all /build (assembly generated files) -- /bindings/blst.h (headers of external functions) -- /bindings/blst_aux.h (headers of external aux functions) +- all `/src/*.c` and `/src/*.h` files (C source files) but `server.c`. +- `server.c` is replaced by `blst_src.c` (which lists only the files needed by Flow crypto). +- all `/build` (assembly generated files). +- `/bindings/blst.h` (headers of external functions). +- `/bindings/blst_aux.h` (headers of external aux functions). +- this `README` file. -TODO: add steps for upgrading the BLST version \ No newline at end of file +To upgrade the BLST version: +- [ ] delete all files in this folder but `blst_src.c` and `README.md`. +- [ ] open BLST repository on the new version. +- [ ] copy all `.c` and `.h` files from `/src/` into this folder. +- [ ] delete `server.c` from this folder. +- [ ] copy the folder `/build/` into this folder. +- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. +- [ ] solve all breaking changes that may occur. +- [ ] update the commit version on this `README`. + +Remember that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should made along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. \ No newline at end of file From c1f294ce73284c49b5db4815a690a671c2e59e0b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 11:33:01 -0600 Subject: [PATCH 147/200] temp tmate debug and compile with asan --- .github/workflows/ci.yml | 2 ++ crypto/bls12381_utils.go | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9bc0e30291e..30f9c107bb3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -69,6 +69,8 @@ jobs: cache: true - name: Run tidy run: make tidy + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 create-dynamic-test-matrix: name: Create Dynamic Test Matrix diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a972ca46b64..2a4e07c0d45 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,9 +4,10 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=address -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ +// #cgo LDFLAGS: -fsanitize=address // #include "bls12381_utils.h" // // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) From bacdb3393cfc22f81a190c089815dbfd2c057a9f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 12:29:32 -0600 Subject: [PATCH 148/200] tmp --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30f9c107bb3..e3cda99316f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,8 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Install C formatter run: sudo apt-get install -y clang-format - name: Run C formatter for ./crypto @@ -69,8 +71,6 @@ jobs: cache: true - name: Run tidy run: make tidy - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 create-dynamic-test-matrix: name: Create Dynamic Test Matrix From c8b643ce2f1a7dddf95359bf1b0cd52d8562cd6e Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 15:57:56 -0600 Subject: [PATCH 149/200] add new target for sanitization --- crypto/Makefile | 13 +++++++++++++ crypto/bls12381_utils.go | 3 +-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index be839a18118..2d50f0d1d75 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -38,6 +38,19 @@ c-format: rm -f .clang-format git diff --exit-code +# sanitize C code +# cannot run on macos +.PHONY: c-sanitize +c-format: +# memory sanitization + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ + LD="-fsanitize=memory" go test \ + if [ $$? -ne 0 ]; then exit 1; fi +# address sanitization and other checks + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ + LD="-fsanitize=address" go test \ + if [ $$? -ne 0 ]; then exit 1; fi + # Go tidy .PHONY: go-tidy go-tidy: diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 2a4e07c0d45..a972ca46b64 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,10 +4,9 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=address -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ -// #cgo LDFLAGS: -fsanitize=address // #include "bls12381_utils.h" // // #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) From 62c1a166b326114a6ddb9829f86d75592b5e947c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 24 Aug 2023 16:44:44 -0600 Subject: [PATCH 150/200] add sanitizer to ci job --- crypto/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 2d50f0d1d75..7855284f9bb 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -41,9 +41,9 @@ c-format: # sanitize C code # cannot run on macos .PHONY: c-sanitize -c-format: +c-sanitize: # memory sanitization - $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ + $(CGO_FLAG) CC="clang -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ LD="-fsanitize=memory" go test \ if [ $$? 
-ne 0 ]; then exit 1; fi # address sanitization and other checks From e557e8beebaa5301efe0be3d0af8543096eb25a8 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 28 Aug 2023 20:06:46 -0600 Subject: [PATCH 151/200] add more sanitization flags and restrict sanitization to linux --- crypto/Makefile | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 7855284f9bb..51454b801d8 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -3,6 +3,9 @@ COVER_PROFILE := cover.out IMAGE_TAG := v0.0.7 +# OS +UNAME := $(shell uname -s) + # allows CI to specify whether to have race detection on / off ifeq ($(RACE_DETECTOR),1) RACE_FLAG := -race @@ -11,7 +14,7 @@ else endif # `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) +ifeq ($(UNAME),Linux) # detect ADX support on the CURRENT linux machine. ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) else @@ -40,16 +43,21 @@ c-format: # sanitize C code # cannot run on macos -.PHONY: c-sanitize +.SILENT: c-sanitize c-sanitize: -# memory sanitization - $(CGO_FLAG) CC="clang -O -D__BLST_PORTABLE__ -O0 -g -fsanitize=memory -fno-omit-frame-pointer" \ - LD="-fsanitize=memory" go test \ - if [ $$? -ne 0 ]; then exit 1; fi -# address sanitization and other checks - $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ - LD="-fsanitize=address" go test \ - if [ $$? -ne 0 ]; then exit 1; fi +# - memory sanitization (only on linux and using clang) - (could use go test -msan) +# - address sanitization and other checks (only on linux) + if [ $(UNAME) = "Linux" ]; then \ + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ + LD="-fsanitize=memory" go test; \ + if [ $$? -ne 0 ]; then exit 1; fi; \ + \ + $(CGO_FLAG) CC="-O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ + LD="-fsanitize=address -fsanitize=leak" go test; \ + if [ $$? 
-ne 0 ]; then exit 1; fi; \ + else \ + echo "sanitization is only supported on Linux"; \ + fi; \ # Go tidy .PHONY: go-tidy From f8bc02b9d63580a0d832e4e6eb6a9be0d8f32b0f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 28 Aug 2023 20:07:22 -0600 Subject: [PATCH 152/200] add sanitization to ci --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3cda99316f..21fdbd7834e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,8 +42,8 @@ jobs: uses: mxschmitt/action-tmate@v3 - name: Install C formatter run: sudo apt-get install -y clang-format - - name: Run C formatter for ./crypto - run: make -C crypto c-format + - name: Run C formatter and sanitizer for ./crypto + run: make -C crypto c-format && make -C crypto c-sanitize - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} From d58b172fa8710807d7682451f08734402e35ef43 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 28 Aug 2023 20:07:58 -0600 Subject: [PATCH 153/200] disable sanitization for E1_write_bytes because of false positive --- crypto/bls12381_utils.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index b583462886b..653935c197f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -168,7 +168,7 @@ ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len) { // write Fr element `a` in big endian bytes. void Fr_write_bytes(byte *bin, const Fr *a) { - // be_bytes_from_limbs works for both limb endiannesses + // be_bytes_from_limbs works for both limb endianness types be_bytes_from_limbs(bin, (limb_t *)a, Fr_BYTES); } @@ -302,7 +302,8 @@ ERROR Fp_read_bytes(Fp *a, const byte *bin, int len) { return VALID; } -// write Fp element to bin and assume `bin` has `Fp_BYTES` allocated bytes. +// write Fp element to `bin`, +// assuming `bin` has `Fp_BYTES` allocated bytes. void Fp_write_bytes(byte *bin, const Fp *a) { be_bytes_from_limbs(bin, (limb_t *)a, Fp_BYTES); } @@ -523,8 +524,8 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { Fp_squ_montg(&a->y, &a->x); Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form - if (!Fp_sqrt_montg(&a->y, - &a->y)) { // check whether x^3+b is a quadratic residue + // check whether x^3+b is a quadratic residue + if (!Fp_sqrt_montg(&a->y, &a->y)) { return POINT_NOT_ON_CURVE; } @@ -539,7 +540,13 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // uncompressed form. It assumes buffer is of length G1_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +#if defined(__has_feature) && __has_feature(memory_sanitizer) +// disable memory sanitization in this function because of a use-of-uninitialized-value +// false positive. 
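/* Illustrative aside on the sanitizers wired into the Makefile targets above:
 * they catch different bug classes. A minimal stand-alone example of each,
 * assuming the same clang flags used there:
 *   - heap out-of-bounds access       -> reported under -fsanitize=address
 *   - branching on uninitialized data -> reported under -fsanitize=memory
 * MSan additionally expects every linked object to be instrumented, which is
 * why uninstrumented hand-written assembly can surface as false positives like
 * the one worked around here. */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  int *a = malloc(4 * sizeof(int));
  a[4] = 1;        /* heap-buffer-overflow: flagged by AddressSanitizer */
  int b;           /* intentionally left uninitialized */
  if (b > 0) {     /* use-of-uninitialized-value: flagged by MemorySanitizer */
    printf("positive\n");
  }
  free(a);
  return 0;
}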
+void __attribute__((no_sanitize("memory"))) E1_write_bytes(byte *bin, const E1 *a) { +#else void E1_write_bytes(byte *bin, const E1 *a) { +#endif if (E1_is_infty(a)) { // set the infinity bit bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); From e280664405f7ae9c9dfec8d338bc293933a5206c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 13:45:39 -0600 Subject: [PATCH 154/200] split c-sanitize and disable msan from CI - add NO_MSAN macro --- crypto/Makefile | 36 +++++++++++++++++++++++++++--------- crypto/bls12381_utils.c | 8 +------- crypto/bls12381_utils.h | 19 +++++++++++++++++-- 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/crypto/Makefile b/crypto/Makefile index 51454b801d8..c58c0f55635 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -41,17 +41,11 @@ c-format: rm -f .clang-format git diff --exit-code -# sanitize C code -# cannot run on macos -.SILENT: c-sanitize -c-sanitize: -# - memory sanitization (only on linux and using clang) - (could use go test -msan) +# address sanitization and other checks +.SILENT: c-asan +c-asan: # - address sanitization and other checks (only on linux) if [ $(UNAME) = "Linux" ]; then \ - $(CGO_FLAG) CC="clang -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ - LD="-fsanitize=memory" go test; \ - if [ $$? -ne 0 ]; then exit 1; fi; \ - \ $(CGO_FLAG) CC="-O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ LD="-fsanitize=address -fsanitize=leak" go test; \ if [ $$? -ne 0 ]; then exit 1; fi; \ @@ -59,6 +53,30 @@ c-sanitize: echo "sanitization is only supported on Linux"; \ fi; \ +# memory sanitization +.SILENT: c-msan +c-msan: +# - memory sanitization (only on linux and using clang) - (could use go test -msan) +# currently, this leads to many false positives, most likely because of assembly code not handled properly +# by asan. If you would like to run this command, you can use `NO_MSAN` to diable msan in some C functions. +# For instance "void NO_MSAN f() {...}" disables msan in function f. `NO_MSAN` is already defined in +# bls12381_utils.h + if [ $(UNAME) = "Linux" ]; then \ + $(CGO_FLAG) CC="clang -DMSAN -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ + LD="-fsanitize=memory" go test; \ + if [ $$? -ne 0 ]; then exit 1; fi; \ + else \ + echo "sanitization is only supported on Linux"; \ + fi; \ + +# sanitize C code +.SILENT: c-sanitize +c-sanitize: c-asan +# - address sanitization and other checks (only on linux) +# - memory sanitization (target m-san) is disabled because of multiple false positives + + + # Go tidy .PHONY: go-tidy go-tidy: diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 653935c197f..25bffcd6bd8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -540,13 +540,7 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // uncompressed form. It assumes buffer is of length G1_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -#if defined(__has_feature) && __has_feature(memory_sanitizer) -// disable memory sanitization in this function because of a use-of-uninitialized-value -// false positive. 
-void __attribute__((no_sanitize("memory"))) E1_write_bytes(byte *bin, const E1 *a) { -#else void E1_write_bytes(byte *bin, const E1 *a) { -#endif if (E1_is_infty(a)) { // set the infinity bit bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); @@ -1063,7 +1057,7 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, } // DEBUG printing functions -#if (DEBUG == 1) +#ifdef DEBUG void bytes_print_(char *s, byte *data, int len) { if (strlen(s)) printf("[%s]:\n", s); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 1936c151497..2e6f39bd0d5 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -129,8 +129,8 @@ void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions -#define DEBUG 1 -#if (DEBUG == 1) +// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test +#ifdef DEBUG #include void bytes_print_(char *, byte *, int); void Fr_print_(char *, Fr *); @@ -139,6 +139,21 @@ void Fp2_print_(char *, const Fp2 *); void Fp12_print_(char *, const Fp12 *); void E1_print_(char *, const E1 *, const int); void E2_print_(char *, const E2 *, const int); + #endif /* DEBUG */ +// memory sanitization disabler +#define NO_MSAN +#ifdef MSAN +/* add NO_MSAN to a function defintion to disable MSAN in that function ( void NO_MSAN f(..) {} ) */ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +// disable memory sanitization in this function because of a use-of-uninitialized-value +// false positive. +#undef NO_MSAN +#define NO_MSAN __attribute__((no_sanitize("memory"))) +#endif /* __has_feature(memory_sanitizer) */ +#endif /* __has_feature*/ +#endif /*MSAN*/ + #endif /* BLS12_381_UTILS */ \ No newline at end of file From aa8d79eb0c1702b748960d342b18e464b10f3f4b Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 13:52:30 -0600 Subject: [PATCH 155/200] disable tmate and format --- crypto/bls12381_utils.c | 6 ++---- crypto/bls12381_utils.h | 9 +++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 25bffcd6bd8..0f158055fd8 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -21,9 +21,7 @@ const Fr BLS12_381_rR = {{ }}; // returns true if a == 0 and false otherwise -bool Fr_is_zero(const Fr *a) { - return vec_is_zero(a, sizeof(Fr)); -} +bool Fr_is_zero(const Fr *a) { return vec_is_zero(a, sizeof(Fr)); } // returns true if a == b and false otherwise bool Fr_is_equal(const Fr *a, const Fr *b) { @@ -1057,7 +1055,7 @@ void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, } // DEBUG printing functions -#ifdef DEBUG +#ifdef DEBUG void bytes_print_(char *s, byte *data, int len) { if (strlen(s)) printf("[%s]:\n", s); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 2e6f39bd0d5..fed426eb997 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -129,7 +129,7 @@ void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related functions -// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test +// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test #ifdef DEBUG #include void bytes_print_(char *, byte *, int); @@ -145,11 +145,12 @@ void E2_print_(char *, const E2 *, const int); // memory sanitization disabler #define NO_MSAN #ifdef MSAN -/* add NO_MSAN 
to a function defintion to disable MSAN in that function ( void NO_MSAN f(..) {} ) */ +/* add NO_MSAN to a function defintion to disable MSAN in that function ( void + * NO_MSAN f(..) {} ) */ #if defined(__has_feature) #if __has_feature(memory_sanitizer) -// disable memory sanitization in this function because of a use-of-uninitialized-value -// false positive. +// disable memory sanitization in this function because of a +// use-of-uninitialized-value false positive. #undef NO_MSAN #define NO_MSAN __attribute__((no_sanitize("memory"))) #endif /* __has_feature(memory_sanitizer) */ From b2302c96a7d931d8366a00d13b756bb050a20389 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 14:03:49 -0600 Subject: [PATCH 156/200] add missing change --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 21fdbd7834e..c0fd71d4030 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,8 +38,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Install C formatter run: sudo apt-get install -y clang-format - name: Run C formatter and sanitizer for ./crypto From ceab7e0fc31960ebc6332a4a0a555b2b58e0ab5a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 14:09:40 -0600 Subject: [PATCH 157/200] fix asan command --- crypto/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/Makefile b/crypto/Makefile index c58c0f55635..43aae8ef39f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -46,7 +46,7 @@ c-format: c-asan: # - address sanitization and other checks (only on linux) if [ $(UNAME) = "Linux" ]; then \ - $(CGO_FLAG) CC="-O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ LD="-fsanitize=address -fsanitize=leak" go test; \ if [ $$? -ne 0 ]; then exit 1; fi; \ else \ From 8314b0cb0b01cd55dbdb669d1fcdbd4ac5c6953d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 29 Aug 2023 18:23:01 -0600 Subject: [PATCH 158/200] more details about updating BLST version --- crypto/blst_src/README.md | 4 +++- crypto/blst_src/blst_src.c | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 1234169dbef..f6adff64fea 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -17,11 +17,13 @@ The folder contains: - this `README` file. To upgrade the BLST version: -- [ ] delete all files in this folder but `blst_src.c` and `README.md`. +- [ ] delete all files in this folder (`./blst_src`) but `blst_src.c` and `README.md`. - [ ] open BLST repository on the new version. - [ ] copy all `.c` and `.h` files from `/src/` into this folder. - [ ] delete `server.c` from this folder. +- [ ] update `blst_src.c` if needed. - [ ] copy the folder `/build/` into this folder. +- [ ] move `./blst_src/build/assembly.S` to `./blst_src/build/blst_assembly.S`. - [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. - [ ] solve all breaking changes that may occur. 
- [ ] update the commit version on this `README`. diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index b904a5d52ee..a50649e5788 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -1,3 +1,8 @@ +// This file contains all BLST lib C files needed for +// Flow crypto. +// +// The list may need to be updated in a new version of BLST is used. + #include "keygen.c" #include "hash_to_field.c" #include "e1.c" From 4adb5cfa9ad2e28fd6d937455b73289b11ff80e9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 1 Sep 2023 18:31:05 -0600 Subject: [PATCH 159/200] minor macro improvement --- crypto/bls12381_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index fed426eb997..2594786ad36 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -44,12 +44,12 @@ typedef enum { #define G2_BYTES (2 * Fp2_BYTES) // Compressed and uncompressed points -#define COMPRESSED 1 #define UNCOMPRESSED 0 +#define COMPRESSED (UNCOMPRESSED^1) #define G1_SERIALIZATION (COMPRESSED) #define G2_SERIALIZATION (COMPRESSED) -#define G1_SER_BYTES (G1_BYTES / (G1_SERIALIZATION + 1)) -#define G2_SER_BYTES (G2_BYTES / (G2_SERIALIZATION + 1)) +#define G1_SER_BYTES (G1_SERIALIZATION==UNCOMPRESSED ? G1_BYTES : (G1_BYTES/2)) +#define G2_SER_BYTES (G2_SERIALIZATION==UNCOMPRESSED ? G2_BYTES : (G2_BYTES/2)) // Fr utilities extern const Fr BLS12_381_rR; From e6b29bc8e83c673d68373d53cdf0962778a32a1c Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 1 Sep 2023 18:55:42 -0600 Subject: [PATCH 160/200] add types sanity check in init() --- crypto/bls12381_utils.c | 7 +++++++ crypto/bls12381_utils.go | 4 +++- crypto/bls12381_utils.h | 3 +++ crypto/blst_include.h | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 0f158055fd8..69ce2ba9c2f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -9,6 +9,13 @@ // compile all blst C src along with this file #include "blst_src.c" +// make sure flow crypto types are consistent with BLST types +void types_sanity(void) { + assert(sizeof(Fp)==sizeof(vec384)); + assert(sizeof(E1)==sizeof(POINTonE1)); + assert(sizeof(E2)==sizeof(POINTonE2)); +} + // ------------------- Fr utilities // Montgomery constant R related to the curve order r diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a972ca46b64..e9a72b6a5e5 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,7 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros -Wno-unused-variable // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" @@ -71,6 +71,8 @@ var g2PublicKey pubKeyBLSBLS12381 // initialization of BLS12-381 curve func initBLS12381() { + C.types_sanity() + if isG1Compressed() { g1SerHeader = 0xC0 } else { diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 2594786ad36..134dd21bdc6 100644 --- 
a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -51,6 +51,9 @@ typedef enum { #define G1_SER_BYTES (G1_SERIALIZATION==UNCOMPRESSED ? G1_BYTES : (G1_BYTES/2)) #define G2_SER_BYTES (G2_SERIALIZATION==UNCOMPRESSED ? G2_BYTES : (G2_BYTES/2)) +// init-related functions +void types_sanity(void); + // Fr utilities extern const Fr BLS12_381_rR; bool Fr_is_zero(const Fr *a); diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 1f0ed3b17ce..dc942b5976b 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -31,7 +31,7 @@ typedef vec384 Fp; // curve E_1 (over F_p) // E_1 points are represented in Jacobian coordinates (x,y,z), // where x, y, x are elements of F_p (type `Fp`). -// `E1` is equivelent to type `POINTonE1` (used internally by BLST for Jacobian +// `E1` is equivalent to type `POINTonE1` (used internally by BLST for Jacobian // E1 elements) `E1` is defined as a struct to be exportable through cgo to the // Go layer. `E1` is also used to represent all subgroup G_1 elements. typedef struct { From 5305b28943981642d0c253d93768e36a73f57443 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 1 Sep 2023 19:34:55 -0600 Subject: [PATCH 161/200] add affine conversions for potential public keys that can be used in muliple pairings --- crypto/bls12381_utils.c | 31 ++++++++++++++++++++++++++----- crypto/bls12381_utils.go | 7 ++++++- crypto/bls12381_utils.h | 3 ++- crypto/bls_multisig.go | 2 +- crypto/dkg_jointfeldman.go | 4 ++-- 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 69ce2ba9c2f..e040d018024 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -708,6 +708,8 @@ const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; // E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or // uncompressed form. The resulting point is guaranteed to be on curve E2 (no G2 // check is included). +// E2 point is in affine coordinates. This avoids further conversions +// when the point is used in multiple pairing computation. // // returns: // - BAD_ENCODING if the length is invalid or serialization header bits are @@ -878,7 +880,7 @@ void E2_add(E2 *res, const E2 *a, const E2 *b) { } // generic point double that must handle point at infinity -void E2_double(E2 *res, const E2 *a) { +static void E2_double(E2 *res, const E2 *a) { POINTonE2_double((POINTonE2 *)res, (POINTonE2 *)a); } @@ -934,6 +936,15 @@ void G2_mult_gen(E2 *res, const Fr *expo) { vec_zero(&tmp, sizeof(tmp)); } +// Exponentiation of generator g2 of G2, res = expo.g2 +// +// This is useful for results being used multiple times in pairings. +// Conversion to affine saves later pre-pairing conversions. +void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { + G2_mult_gen(res, expo); + E2_to_affine(res, res); +} + // checks if input E2 point is on the subgroup G2. // It assumes input `p` is on E2. bool E2_in_G2(const E2 *p) { @@ -949,6 +960,16 @@ void E2_sum_vector(E2 *sum, const E2 *y, const int len) { } } +// computes the sum of the E2 array elements `y[i]`, converts it +// to affine coordinates, and writes it in `sum`. +// +// This is useful for results being used multiple times in pairings. +// Conversion to affine saves later pre-pairing conversions. 
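/* Background note (assuming the usual Jacobian convention mentioned in
 * blst_include.h): the affine point is recovered from Jacobian (X, Y, Z) as
 * x = X / Z^2 and y = Y / Z^3, which costs a field inversion. Paying that
 * inversion once on an aggregated result, as in E2_sum_vector_to_affine below,
 * is what saves the repeated pre-pairing conversions mentioned above. */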
+void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int len) { + E2_sum_vector(sum, y, len); + E2_to_affine(sum, sum); +} + // Subtracts all G2 array elements `y` from an element `x` and writes the // result in res void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len) { @@ -1014,7 +1035,7 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { continue; } // `miller_loop_n` expects affine coordinates in a `POINTonEx_affine` array. - // `POINTonEx_affine` has a different size than `POINTonEx` or `Ex` ! + // `POINTonEx_affine` has a different size than `POINTonEx` and `Ex` ! E1 tmp1; E1_to_affine(&tmp1, p + i); vec_copy(p_aff + n, &tmp1, sizeof(POINTonE1_affine)); @@ -1022,7 +1043,7 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { E2_to_affine(&tmp2, q + i); vec_copy(q_aff + n, &tmp2, sizeof(POINTonE2_affine)); n++; - if (n == N_MAX) { // if p_ and q_ are filled, batch `N_MAX` miller loops + if (n == N_MAX) { // if p_aff and q_aff are filled, batch `N_MAX` miller loops if (!init_flag) { miller_loop_n(res_vec, q_aff, p_aff, N_MAX); init_flag = 1; @@ -1034,8 +1055,8 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { n = 0; } } - // if p_ and q_ aren't empty, - // remaining couples are also batched in `n` miller loops + // if p_aff and q_aff aren't empty, + // the remaining couples are also batched in `n` miller loops if (n > 0) { if (!init_flag) { miller_loop_n(res_vec, q_aff, p_aff, n); diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index e9a72b6a5e5..21d9e13af05 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -112,8 +112,13 @@ func generatorScalarMultG1(res *pointE1, expo *scalar) { } // Scalar multiplication of generator g2 in G2 +// +// This often results in a public key that is used in +// multiple pairing computation. Therefore, convert the +// resulting point to affine coordinate to save pre-pairing +// conversions. 
func generatorScalarMultG2(res *pointE2, expo *scalar) { - C.G2_mult_gen((*C.E2)(res), (*C.Fr)(expo)) + C.G2_mult_gen_to_affine((*C.E2)(res), (*C.Fr)(expo)) } // comparison in Fr where r is the group order of G1/G2 diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 134dd21bdc6..1098144fd7c 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -112,12 +112,13 @@ void E2_to_affine(E2 *, const E2 *); ERROR E2_read_bytes(E2 *, const byte *, const int); void E2_write_bytes(byte *, const E2 *); void G2_mult_gen(E2 *, const Fr *); +void G2_mult_gen_to_affine(E2 *, const Fr *); void E2_mult(E2 *, const E2 *, const Fr *); void E2_mult_small_expo(E2 *, const E2 *, const byte); void E2_add(E2 *res, const E2 *a, const E2 *b); -void E2_double(E2 *res, const E2 *a); void E2_neg(E2 *, const E2 *); void E2_sum_vector(E2 *, const E2 *, const int); +void E2_sum_vector_to_affine(E2 *, const E2 *, const int); void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len); bool E2_in_G2(const E2 *); void unsafe_map_bytes_to_G2(E2 *, const byte *, int); diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index 7f57cd09888..2567aaf2ba0 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -184,7 +184,7 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } var sum pointE2 - C.E2_sum_vector((*C.E2)(&sum), (*C.E2)(&points[0]), + C.E2_sum_vector_to_affine((*C.E2)(&sum), (*C.E2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 40db316efb5..115730e33d9 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -302,12 +302,12 @@ func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointE2 (C.int)(qualified)) // sum up Y var jointPublicKey pointE2 - C.E2_sum_vector((*C.E2)(&jointPublicKey), + C.E2_sum_vector_to_affine((*C.E2)(&jointPublicKey), (*C.E2)(&qualifiedPubKey[0]), (C.int)(qualified)) // sum up []y jointy := make([]pointE2, s.size) for i := 0; i < s.size; i++ { - C.E2_sum_vector((*C.E2)(&jointy[i]), + C.E2_sum_vector_to_affine((*C.E2)(&jointy[i]), (*C.E2)(&qualifiedy[i][0]), (C.int)(qualified)) } return &jointx, &jointPublicKey, jointy From afa9f240949a1180e49447a1ce06bb458e44bc00 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Sep 2023 19:45:17 -0600 Subject: [PATCH 162/200] clarify some TODOs --- crypto/README.md | 3 --- crypto/bls.go | 3 --- crypto/bls12381_utils.c | 36 ++++++++++++++++++++---------------- crypto/dkg_core.c | 10 ++++++---- crypto/dkg_feldmanvss.go | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/crypto/README.md b/crypto/README.md index 97156fa52c9..c15d0a36462 100644 --- a/crypto/README.md +++ b/crypto/README.md @@ -62,9 +62,6 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` public keys, using a binary tree of aggregations. * SPoCK scheme based on BLS: verifies two signatures have been generated from the same message that is unknown to the verifier. - * Future features: - * support minimal-pubkey-size variant - ### PRNG * ChaCha20-based CSPRNG diff --git a/crypto/bls.go b/crypto/bls.go index b5ed13bd83d..c6f01a6ab28 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -23,9 +23,6 @@ package crypto // - SPoCK scheme based on BLS: verifies two signatures are generated from the same message, // even though the message is unknown to the verifier. 
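// (Aside: one standard way to realize such a check, sketched here rather than
// quoting this package's exact routine: with signatures in G1 and public keys
// in G2, two signatures over the same message m satisfy
//   e(sig_A, pk_B) == e(sig_B, pk_A),
// since both sides equal e(H(m), g2)^(sk_A * sk_B) when sig_X = sk_X * H(m);
// the verifier needs neither m nor H(m).)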
-// future features: -// - implement a G1/G2 swap (minimal-pubkey-size variant) - // #include "bls_include.h" import "C" diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index e040d018024..14d98869847 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -140,16 +140,15 @@ ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { if (len != Fr_BYTES) { return BAD_ENCODING; } + // compare to r using the BLST tool pow256 tmp; - // compare to r using the provided tool from BLST - pow256_from_be_bytes(tmp, bin); // TODO: check endianness!! - if (!check_mod_256( - tmp, - BLS12_381_r)) { // check_mod_256 compares pow256 against a vec256! + pow256_from_be_bytes(tmp, bin); + // (check_mod_256 compares pow256 against a vec256!) + if (!check_mod_256(tmp, BLS12_381_r)) { return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)a, bin, Fr_BYTES); // TODO: check endianness!! + limbs_from_be_bytes((limb_t *)a, bin, Fr_BYTES); return VALID; } @@ -177,11 +176,16 @@ void Fr_write_bytes(byte *bin, const Fr *a) { be_bytes_from_limbs(bin, (limb_t *)a, Fr_BYTES); } -// maps big-endian bytes into an Fr element using modular reduction -// Input is byte-big-endian, output is Fr (internally vec256) -// TODO: check redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t -// n0); +// maps big-endian bytes of any size into an Fr element using modular reduction. +// Input is byte-big-endian, output is Fr (internally vec256). +// +// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t +// n0) to reduce 512 bits at a time. static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { + // input can be written in base 2^|R|, with R the Montgomery constant + // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) + // Therefore N mod p can be expressed using R as: + // N mod p = l_1 + L_2*R .. + L_n*R^(n-1) Fr digit, radix; Fr_set_zero(out); Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 @@ -200,7 +204,7 @@ static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { limbs_from_be_bytes((limb_t *)&digit, p - n, n); Fr_mul_montg(&digit, &digit, &radix); Fr_add(out, out, &digit); - // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n + // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n, // reduce the extra R Fr_from_montg(out, out); // clean up possible sensitive data @@ -463,8 +467,8 @@ bool E1_in_G1(const E1 *p) { // - POINT_NOT_ON_CURVE if deserialized point isn't on E1 // - VALID if deserialization is valid -// TODO: replace with POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, -// and update logic with G2 subgroup check? +// Note: could use POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, +// but needs to update the logic around G2 subgroup check ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // check the length if (len != G1_SER_BYTES) { @@ -717,9 +721,9 @@ const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; // - BAD_VALUE if Fp^2 coordinates couldn't deserialize // - POINT_NOT_ON_CURVE if deserialized point isn't on E2 // - VALID if deserialization is valid - -// TODO: replace with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, -// and update logic with G2 subgroup check? +// +// Note: can use with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, +// and update the logic around G2 subgroup check. 
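/* For reference, a sketch of how the three flag bits of the first serialized
 * byte can be read in the zcash-style format referenced above for E1/E2
 * serialization (only the header decoding is shown; the helper name is
 * illustrative): */
static void decode_ser_header(const unsigned char *bin) {
  int compressed = (bin[0] >> 7) & 1; /* compression flag */
  int infinity   = (bin[0] >> 6) & 1; /* point-at-infinity flag */
  int y_flag     = (bin[0] >> 5) & 1; /* "larger y" sign flag, meaningful only
                                         for compressed, finite points */
  (void)compressed; (void)infinity; (void)y_flag;
}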
ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { // check the length if (len != G2_SER_BYTES) { diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 811f9c84653..e5b3bd5d333 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -65,11 +65,13 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { } } -// The function imports an array of E2 points from a concatenated array of -// bytes. The bytes array is supposed to be in (len * G2_SER_BYTES) -ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int len) { +// The function imports an array of `n` E2 points from a concatenated array of +// bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). +// +// If return is `VALID`, output vector is guaranteed to be in E2. +ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int n) { byte *p = (byte *)src; - for (int i = 0; i < len; i++) { + for (int i = 0; i < n; i++) { int read_ret = E2_read_bytes(&A[i], p, G2_SER_BYTES); if (read_ret != VALID) return read_ret; diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 0de83b43dc2..36f486945e7 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -398,7 +398,7 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { // receives the public vector from the func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { - // only accept the verification vector from the . + // only accept the verification vector from the dealer. if origin != s.dealerIndex { return } From 76850aabf765b81578c7913dfeac6c6f28d20f83 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Sep 2023 19:54:02 -0600 Subject: [PATCH 163/200] DKG's readVector enforces A to be in G2 --- crypto/dkg_core.c | 19 ++++++++++++++++--- crypto/dkg_feldmanvss.go | 8 ++++---- crypto/dkg_include.h | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index e5b3bd5d333..af9aac9a560 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -68,13 +68,26 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { // The function imports an array of `n` E2 points from a concatenated array of // bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). // -// If return is `VALID`, output vector is guaranteed to be in E2. -ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int n) { +// If return is `VALID`, output vector is guaranteed to be in G2. +// It returns other errors if at least one input isn't a serialization of a E2 +// point, or an input E2 point isn't in G2. +// returns: +// - BAD_ENCODING if the serialization header bits of at least one input are invalid. +// - BAD_VALUE if Fp^2 coordinates of at least one input couldn't deserialize. +// - POINT_NOT_ON_CURVE if at least one input deserialized point isn't on E2. +// - POINT_NOT_IN_GROUP if at least one E2 point isn't in G2. +// - VALID if deserialization of all points to G2 is valid. +ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { byte *p = (byte *)src; for (int i = 0; i < n; i++) { int read_ret = E2_read_bytes(&A[i], p, G2_SER_BYTES); - if (read_ret != VALID) + if (read_ret != VALID) { return read_ret; + } + if (!E2_in_G2(&A[i])) { + return POINT_NOT_IN_GROUP; + } + p += G2_SER_BYTES; } // TODO: add G2 subgroup check? 
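/* These import helpers sit next to the share-evaluation routines declared in
 * dkg_include.h (Fr_polynomial_image, E2_polynomial_images; see the hunk that
 * follows). In the textbook Feldman-VSS setting, a participant's share is the
 * polynomial value P(i) mod r. A plain-integer Horner sketch of such an
 * evaluation, with a small modulus and names that are illustrative rather than
 * this package's internals: */
#include <stdint.h>

/* Evaluate P(x) = a[0] + a[1]*x + ... + a[deg]*x^deg modulo m by Horner's rule.
 * Assumes m < 2^32 and x < m so the 64-bit intermediates cannot overflow. */
static uint64_t poly_eval_mod(const uint64_t *a, int deg, uint64_t x, uint64_t m) {
  uint64_t y = a[deg] % m;
  for (int i = deg - 1; i >= 0; i--) {
    y = (y * x % m + a[i] % m) % m;
  }
  return y;
}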
diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 36f486945e7..c89bee98ea1 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -457,10 +457,10 @@ func writeVerifVector(dest []byte, A []pointE2) { ) } -// readVerifVector imports A vector from an array of bytes, -// assuming the slice length matches the vector length +// readVerifVector imports A vector (G2 points) from an array of bytes, +// assuming the slice length matches the vector length. func readVerifVector(A []pointE2, src []byte) error { - read := C.E2_vector_read_bytes( + read := C.G2_vector_read_bytes( (*C.E2)(&A[0]), (*C.uchar)(&src[0]), (C.int)(len(A))) @@ -468,7 +468,7 @@ func readVerifVector(A []pointE2, src []byte) error { return nil } // invalid A vector - return invalidInputsErrorf("the verifcation vector does not serialize valid E2 points: error code %d", read) + return invalidInputsErrorf("the verification vector does not serialize valid G2 points: error code %d", read) } func (s *feldmanVSSstate) verifyShare() bool { diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index fc377f26b4f..05d46187749 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -9,7 +9,7 @@ void Fr_polynomial_image(Fr *out, E2 *y, const Fr *a, const int deg, const byte x); void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int deg); void G2_vector_write_bytes(byte *out, const E2 *A, const int len); -ERROR E2_vector_read_bytes(E2 *A, const byte *src, const int len); +ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int len); bool G2_check_log(const Fr *x, const E2 *y); #endif From 589d8d78c39f5967c8743d51cf2c7f8d585eb5b1 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Sep 2023 19:58:08 -0600 Subject: [PATCH 164/200] format --- crypto/bls12381_utils.c | 17 +++++++++-------- crypto/bls12381_utils.h | 8 +++++--- crypto/dkg_core.c | 13 ++++++++----- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 14d98869847..e4636aad457 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -11,9 +11,9 @@ // make sure flow crypto types are consistent with BLST types void types_sanity(void) { - assert(sizeof(Fp)==sizeof(vec384)); - assert(sizeof(E1)==sizeof(POINTonE1)); - assert(sizeof(E2)==sizeof(POINTonE2)); + assert(sizeof(Fp) == sizeof(vec384)); + assert(sizeof(E1) == sizeof(POINTonE1)); + assert(sizeof(E2) == sizeof(POINTonE2)); } // ------------------- Fr utilities @@ -140,11 +140,11 @@ ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { if (len != Fr_BYTES) { return BAD_ENCODING; } - // compare to r using the BLST tool + // compare to r using the BLST tool pow256 tmp; pow256_from_be_bytes(tmp, bin); // (check_mod_256 compares pow256 against a vec256!) - if (!check_mod_256(tmp, BLS12_381_r)) { + if (!check_mod_256(tmp, BLS12_381_r)) { return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); @@ -179,8 +179,8 @@ void Fr_write_bytes(byte *bin, const Fr *a) { // maps big-endian bytes of any size into an Fr element using modular reduction. // Input is byte-big-endian, output is Fr (internally vec256). // -// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t -// n0) to reduce 512 bits at a time. +// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, +// limb_t n0) to reduce 512 bits at a time. 
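/* To make the digit-by-digit reduction below concrete, a plain-arithmetic
 * sketch (B stands in for 2^|R|; the real code folds the Montgomery factors R
 * in with Fr_mul_montg instead of tracking B^i directly, then strips the
 * leftover R with Fr_from_montg; modulus and digits here are illustrative): */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t p = 1000003;  /* small stand-in modulus */
  const uint64_t B = 1u << 16; /* stand-in for 2^|R| */
  /* N = d[0] + d[1]*B + d[2]*B^2, least-significant digit first */
  const uint64_t d[3] = {0x1234, 0xabcd, 0x00ff};

  uint64_t out = 0, radix = 1; /* radix tracks B^i mod p */
  for (int i = 0; i < 3; i++) {
    out = (out + (d[i] % p) * radix) % p;
    radix = (radix * B) % p;
  }

  uint64_t N = d[0] + d[1] * B + d[2] * B * B; /* small enough to hold exactly */
  printf("%d\n", out == N % p);                /* prints 1 */
  return 0;
}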
static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { // input can be written in base 2^|R|, with R the Montgomery constant // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) @@ -1047,7 +1047,8 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { E2_to_affine(&tmp2, q + i); vec_copy(q_aff + n, &tmp2, sizeof(POINTonE2_affine)); n++; - if (n == N_MAX) { // if p_aff and q_aff are filled, batch `N_MAX` miller loops + // if p_aff and q_aff are filled, batch `N_MAX` miller loops + if (n == N_MAX) { if (!init_flag) { miller_loop_n(res_vec, q_aff, p_aff, N_MAX); init_flag = 1; diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 1098144fd7c..b0f96669ed7 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -45,11 +45,13 @@ typedef enum { // Compressed and uncompressed points #define UNCOMPRESSED 0 -#define COMPRESSED (UNCOMPRESSED^1) +#define COMPRESSED (UNCOMPRESSED ^ 1) #define G1_SERIALIZATION (COMPRESSED) #define G2_SERIALIZATION (COMPRESSED) -#define G1_SER_BYTES (G1_SERIALIZATION==UNCOMPRESSED ? G1_BYTES : (G1_BYTES/2)) -#define G2_SER_BYTES (G2_SERIALIZATION==UNCOMPRESSED ? G2_BYTES : (G2_BYTES/2)) +#define G1_SER_BYTES \ + (G1_SERIALIZATION == UNCOMPRESSED ? G1_BYTES : (G1_BYTES / 2)) +#define G2_SER_BYTES \ + (G2_SERIALIZATION == UNCOMPRESSED ? G2_BYTES : (G2_BYTES / 2)) // init-related functions void types_sanity(void); diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index af9aac9a560..f7521aa5ac7 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -67,14 +67,17 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { // The function imports an array of `n` E2 points from a concatenated array of // bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). -// +// // If return is `VALID`, output vector is guaranteed to be in G2. -// It returns other errors if at least one input isn't a serialization of a E2 +// It returns other errors if at least one input isn't a serialization of a E2 // point, or an input E2 point isn't in G2. // returns: -// - BAD_ENCODING if the serialization header bits of at least one input are invalid. -// - BAD_VALUE if Fp^2 coordinates of at least one input couldn't deserialize. -// - POINT_NOT_ON_CURVE if at least one input deserialized point isn't on E2. +// - BAD_ENCODING if the serialization header bits of at least one input are +// invalid. +// - BAD_VALUE if Fp^2 coordinates of at least one input couldn't +// deserialize. +// - POINT_NOT_ON_CURVE if at least one input deserialized point isn't on +// E2. // - POINT_NOT_IN_GROUP if at least one E2 point isn't in G2. // - VALID if deserialization of all points to G2 is valid. 
ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { From 14c9e3de1093a7d7e7245d05f18073490f2f17c7 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 11:57:54 -0600 Subject: [PATCH 165/200] clean up c flags and add instruction to readme --- crypto/bls12381_utils.go | 2 +- crypto/bls_thresholdsign.go | 1 - crypto/blst_src/README.md | 1 + crypto/dkg_core.c | 2 -- crypto/dkg_feldmanvss.go | 1 - crypto/dkg_feldmanvssq.go | 1 - 6 files changed, 2 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 21d9e13af05..a3867b31b20 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -4,7 +4,7 @@ package crypto // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -Wall -Wno-unused-function -Wno-unused-macros -Wno-unused-variable +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -Wall -fno-builtin-memcpy -fno-builtin-memset -Wno-unused-function -Wno-unused-macros -Wno-unused-variable // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 9451f4fb6dc..83fb6d6949f 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -1,6 +1,5 @@ package crypto -// #cgo CFLAGS: // #include "bls_thresholdsign_include.h" import "C" diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index f6adff64fea..d283b4dd6c4 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -25,6 +25,7 @@ To upgrade the BLST version: - [ ] copy the folder `/build/` into this folder. - [ ] move `./blst_src/build/assembly.S` to `./blst_src/build/blst_assembly.S`. - [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. +- [ ] check that C flags in `./bls12381_utils.go` still match the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. - [ ] update the commit version on this `README`. diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index f7521aa5ac7..f5f48db67ae 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -90,10 +90,8 @@ ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { if (!E2_in_G2(&A[i])) { return POINT_NOT_IN_GROUP; } - p += G2_SER_BYTES; } - // TODO: add G2 subgroup check? 
return VALID; } diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index c89bee98ea1..2814e59ee14 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -1,6 +1,5 @@ package crypto -// #cgo CFLAGS: // #include "dkg_include.h" import "C" diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index b8056b990dc..c3aca992ee2 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -1,6 +1,5 @@ package crypto -// #cgo CFLAGS: // #include "dkg_include.h" import "C" From 12e338e8bf2f1110648159e4eef15afede554914 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 13:18:57 -0600 Subject: [PATCH 166/200] c format --- crypto/bls_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/bls_core.c b/crypto/bls_core.c index d221f4c2237..aac7d60ee18 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -391,7 +391,8 @@ static void bls_batch_verify_tree(const node *root, const int len, // indices mixup. // - optimize the verification by verifying an aggregated signature against an // aggregated -// public key, and use a top-down recursive verification to find invalid signatures. +// public key, and use a top-down recursive verification to find invalid +// signatures. void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, const byte *sigs_bytes, const byte *data, const int data_len, const byte *seed) { From 7a268aeea938f0aa2e9f47130595b55b727f9931 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 13:20:16 -0600 Subject: [PATCH 167/200] mod tidy --- go.sum | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/go.sum b/go.sum index 753fb0b8bf2..76d79b1ea73 100644 --- a/go.sum +++ b/go.sum @@ -190,6 +190,7 @@ github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13P github.com/btcsuite/btcd v0.21.0-beta/go.mod h1:ZSWyehm27aAuS9bvkATT+Xte3hjHZ+MRgMY/8NJ7K94= github.com/btcsuite/btcd/btcec/v2 v2.2.1 h1:xP60mv8fvp+0khmrN0zTdPC3cNm24rfeE6lh2R/Yv3E= github.com/btcsuite/btcd/btcec/v2 v2.2.1/go.mod h1:9/CSmJxmuvqzX9Wh2fXMWToLOHhPd11lSPuIupwTkI8= +github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA= github.com/btcsuite/btcutil v0.0.0-20190207003914-4c204d697803/go.mod h1:+5NJ2+qvTyV9exUAL/rxXi3DcLg2Ts+ymUAY5y4NvMg= github.com/btcsuite/btcutil v0.0.0-20190425235716-9e5f4b9a998d/go.mod h1:+5NJ2+qvTyV9exUAL/rxXi3DcLg2Ts+ymUAY5y4NvMg= @@ -270,7 +271,9 @@ github.com/davidlazar/go-crypto v0.0.0-20170701192655-dcfb0a7ac018/go.mod h1:rQY github.com/davidlazar/go-crypto v0.0.0-20200604182044-b73af7476f6c h1:pFUpOrbxDR6AkioZ1ySsx5yxlDQZ8stG2b88gTPxgJU= github.com/davidlazar/go-crypto v0.0.0-20200604182044-b73af7476f6c/go.mod h1:6UhI8N9EjYm1c2odKpFpAYeR8dsBeM7PtzQhRgxRr9U= github.com/deckarep/golang-set v0.0.0-20180603214616-504e848d77ea/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ= +github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc= github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod 
h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= @@ -1250,20 +1253,9 @@ github.com/onflow/flow-core-contracts/lib/go/templates v1.2.3 h1:X25A1dNajNUtE+K github.com/onflow/flow-core-contracts/lib/go/templates v1.2.3/go.mod h1:dqAUVWwg+NlOhsuBHex7bEWmsUjsiExzhe/+t4xNH6A= github.com/onflow/flow-ft/lib/go/contracts v0.7.0 h1:XEKE6qJUw3luhsYmIOteXP53gtxNxrwTohgxJXCYqBE= github.com/onflow/flow-ft/lib/go/contracts v0.7.0/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= -<<<<<<< HEAD -github.com/onflow/flow-go-sdk v0.40.0 h1:s8uwoyTquN8tjdXpqGmNkXTjf79yUII8JExc5QEl4Xw= -github.com/onflow/flow-go-sdk v0.40.0/go.mod h1:34dxXk9Hp/bQw6Zy6+H44Xo0kQU+aJyQoqdDxq00rJM= -github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230424214110-4f04b71ea3e1 h1:QxQxCgce0tvAn/ibnEVYcUFRpy9QLxdfLRavKWYptvU= -github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230424214110-4f04b71ea3e1/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= -github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8 h1:XcSR/n2aSVO7lOEsKScYALcpHlfowLwicZ9yVbL6bnA= -github.com/onflow/go-bitswap v0.0.0-20221017184039-808c5791a8a8/go.mod h1:73C8FlT4L/Qe4Cf5iXUNL8b2pvu4zs5dJMMJ5V2TjUI= -======= github.com/onflow/flow-go-sdk v0.24.0/go.mod h1:IoptMLPyFXWvyd9yYA6/4EmSeeozl6nJoIv4FaEMg74= github.com/onflow/flow-go-sdk v0.41.10 h1:Cio6GJhtx532TUY+cqrqWglD5sZCXkWeM5QvaRha3p4= github.com/onflow/flow-go-sdk v0.41.10/go.mod h1:0a0LiQFbFt8RW/ptoMUU7YkvW9ArVcbjLE0XS78uz1E= -github.com/onflow/flow-go/crypto v0.21.3/go.mod h1:vI6V4CY3R6c4JKBxdcRiR/AnjBfL8OSD97bJc60cLuQ= -github.com/onflow/flow-go/crypto v0.24.9 h1:0EQp+kSZYJepMIiSypfJVe7tzsPcb6UXOdOtsTCDhBs= -github.com/onflow/flow-go/crypto v0.24.9/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0 h1:rhUDeD27jhLwOqQKI/23008CYfnqXErrJvc4EFRP2a0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0/go.mod h1:YsvzYng4htDgRB9sa9jxdwoTuuhjK8WYWXTyLkIigZY= github.com/onflow/flow/protobuf/go/flow v0.2.2/go.mod h1:gQxYqCfkI8lpnKsmIjwtN2mV/N2PIwc1I+RUK4HPIc8= @@ -1271,7 +1263,6 @@ github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230628215638-83439d22e0ce h1: github.com/onflow/flow/protobuf/go/flow v0.3.2-0.20230628215638-83439d22e0ce/go.mod h1:NA2pX2nw8zuaxfKphhKsk00kWLwfd+tv8mS23YXO4Sk= github.com/onflow/go-bitswap v0.0.0-20230703214630-6d3db958c73d h1:QcOAeEyF3iAUHv21LQ12sdcsr0yFrJGoGLyCAzYYtvI= github.com/onflow/go-bitswap v0.0.0-20230703214630-6d3db958c73d/go.mod h1:GCPpiyRoHncdqPj++zPr9ZOYBX4hpJ0pYZRYqSE8VKk= ->>>>>>> master github.com/onflow/sdks v0.5.0 h1:2HCRibwqDaQ1c9oUApnkZtEAhWiNY2GTpRD5+ftdkN8= github.com/onflow/sdks v0.5.0/go.mod h1:F0dj0EyHC55kknLkeD10js4mo14yTdMotnWMslPirrU= github.com/onflow/wal v0.0.0-20230529184820-bc9f8244608d h1:gAEqYPn3DS83rHIKEpsajnppVD1+zwuYPFyeDVFaQvg= @@ -1519,17 +1510,10 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -<<<<<<< HEAD -github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= -github.com/subosito/gotenv v1.4.0/go.mod h1:mZd6rFysKEcUhUHXJk0C/08wAgyDBFuwEYL7vWWGaGo= -======= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify 
v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/subosito/gotenv v1.4.2 h1:X1TuBLAMDFbaTAChgCBLu3DU3UPyELpnF2jjJ2cz/S8= github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= -github.com/supranational/blst v0.3.4/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= ->>>>>>> master github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= @@ -1681,7 +1665,6 @@ golang.org/x/crypto v0.0.0-20190618222545-ea8f1a30c443/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200115085410-6d4e4cb37c7d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200117160349-530e935923ad/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200221231518-2aa609cf4a9d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200311171314-f7b00557c8c4/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200423211502-4bdfaf469ed5/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -1697,6 +1680,7 @@ golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5 golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1801,6 +1785,7 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= @@ -1888,7 +1873,6 @@ golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200107162124-548cf772de50/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= From 7640d9420a540907f733f8914e1cca7646c4a245 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 13:28:12 -0600 Subject: [PATCH 168/200] remove deprecated Seed use --- crypto/bls12381_utils_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index a9efd543ed1..ade31bbb6b9 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -3,7 +3,6 @@ package crypto import ( "crypto/rand" "encoding/hex" - mrand "math/rand" "testing" "github.com/stretchr/testify/assert" @@ -166,7 +165,7 @@ func TestSubgroupCheck(t *testing.T) { // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { seed := make([]byte, g2BytesLen) - _, err := mrand.Read(seed) + _, err := rand.Read(seed) require.NoError(b, err) b.Run("G1", func(b *testing.B) { From 5c72468678d5682dd76209ccb33692a97f052dc3 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 15:16:32 -0600 Subject: [PATCH 169/200] add temp tmate --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b24de2f44ca..a1dec93631f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -205,6 +205,8 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests From 5011cc6d2888597d0f84385800bbb141ec1161e8 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 23:34:17 -0600 Subject: [PATCH 170/200] fix merging issues and delete relic related builds --- .github/workflows/flaky-test-monitor.yml | 2 -- cmd/bootstrap/README.md | 5 ----- config/README.md | 4 ++-- integration/Makefile | 27 ++++++------------------ integration/benchmark/server/bench.sh | 4 +--- 5 files changed, 9 insertions(+), 33 deletions(-) diff --git a/.github/workflows/flaky-test-monitor.yml b/.github/workflows/flaky-test-monitor.yml index f1c87d03348..c3b662fe070 100644 --- a/.github/workflows/flaky-test-monitor.yml +++ b/.github/workflows/flaky-test-monitor.yml @@ -168,8 +168,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests diff --git a/cmd/bootstrap/README.md b/cmd/bootstrap/README.md index f9c2b3f2e79..6b138946ca1 100644 --- a/cmd/bootstrap/README.md +++ b/cmd/bootstrap/README.md @@ -97,12 +97,7 @@ Each input is a config file specified as a command line parameter: #### Example ```bash -<<<<<<< HEAD go run ./cmd/bootstrap finalize \ - --fast-kg \ -======= -go run -tags relic ./cmd/bootstrap finalize \ ->>>>>>> master --root-chain main \ --root-height 0 \ --root-parent 
0000000000000000000000000000000000000000000000000000000000000000 \ diff --git a/config/README.md b/config/README.md index f8a31bda478..a7045dd00e1 100644 --- a/config/README.md +++ b/config/README.md @@ -15,12 +15,12 @@ defined. A single default value can be overridden by setting the CLI flag for th config to false. Override entire config file. ```shell -go build -tags relic -o flow-access-node ./cmd/access +go build -o flow-access-node ./cmd/access ./flow-access-node --config-file=config/config.yml ``` Override a single configuration value. ```shell -go build -tags relic -o flow-access-node ./cmd/access +go build -o flow-access-node ./cmd/access ./flow-access-node --network-connection-pruning=false ``` ### Adding a new config value diff --git a/integration/Makefile b/integration/Makefile index 8feb33f72e6..963b7093511 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -58,16 +58,12 @@ consensus-tests: .PHONY: epochs-cohort1-tests epochs-cohort1-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs -<<<<<<< HEAD - $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 30m ./tests/epochs/... -======= - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 20m ./tests/epochs/cohort1/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 20m ./tests/epochs/cohort1/... .PHONY: epochs-cohort2-tests epochs-cohort2-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 20m ./tests/epochs/cohort2/... ->>>>>>> master + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 20m ./tests/epochs/cohort2/... .PHONY: ghost-tests ghost-tests: @@ -88,33 +84,22 @@ verification-tests: # upgrades-tests tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: upgrades-tests upgrades-tests: -<<<<<<< HEAD - $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... -======= - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/upgrades/... -p 1 ->>>>>>> master + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... -p 1 .PHONY: network-tests network-tests: $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... # BFT tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel -<<<<<<< HEAD -.PHONY: bft-tests -bft-tests: - $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/... -p 1 - -======= .PHONY: bft-framework-tests bft-framework-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/framework/... 
-p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/framework/... -p 1 .PHONY: bft-protocol-tests bft-protocol-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/protocol/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/protocol/... -p 1 .PHONY: bft-gossipsub-tests bft-gossipsub-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/gossipsub/... -p 1 ->>>>>>> master + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/gossipsub/... -p 1 .PHONY: bft-tests bft-tests: bft-framework-tests bft-protocol-tests bft-gossipsub-tests diff --git a/integration/benchmark/server/bench.sh b/integration/benchmark/server/bench.sh index 6ada16119a1..8c87214a3b1 100755 --- a/integration/benchmark/server/bench.sh +++ b/integration/benchmark/server/bench.sh @@ -22,8 +22,6 @@ while read -r branch_hash; do git log --oneline | head -1 git describe - make -C ../.. crypto_setup_gopath - # instead of running "make stop" which uses docker-compose for a lot of older versions, # we explicitly run the command here with "docker compose" DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker compose -f docker-compose.nodes.yml down -v --remove-orphans @@ -36,7 +34,7 @@ while read -r branch_hash; do # sleep is workaround for slow initialization of some node types, so that benchmark does not quit immediately with "connection refused" sleep 30; - go run -tags relic ../benchmark/cmd/ci -log-level debug -git-repo-path ../../ -tps-initial 800 -tps-min 1 -tps-max 1200 -duration 30m + go run ../benchmark/cmd/ci -log-level debug -git-repo-path ../../ -tps-initial 800 -tps-min 1 -tps-max 1200 -duration 30m # instead of running "make stop" which uses docker-compose for a lot of older versions, # we explicitly run the command here with "docker compose" From c52da9c2d30ee14701894e53cdc6b401b6b50718 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 6 Sep 2023 23:34:32 -0600 Subject: [PATCH 171/200] Revert "add temp tmate" This reverts commit 5c72468678d5682dd76209ccb33692a97f052dc3. 
--- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1dec93631f..b24de2f44ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -205,8 +205,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests From e5c0630086fd083bd315031b229ff870ce0c6ef9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 00:18:25 -0600 Subject: [PATCH 172/200] update flakey test monitor --- .github/workflows/flaky-test-monitor.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/flaky-test-monitor.yml b/.github/workflows/flaky-test-monitor.yml index c3b662fe070..aa9d99dd65b 100644 --- a/.github/workflows/flaky-test-monitor.yml +++ b/.github/workflows/flaky-test-monitor.yml @@ -83,18 +83,15 @@ jobs: matrix: include: - name: crypto - make1: -C crypto setup - make2: unittest + setup: race: 1 test_category: unit-crypto - name: insecure - make1: install-tools - make2: test + setup: install-tools race: 0 test_category: unit-insecure - name: integration - make1: install-tools - make2: test + setup: install-tools race: 0 test_category: unit-integration runs-on: ubuntu-latest @@ -107,11 +104,11 @@ jobs: go-version: ${{ env.GO_VERSION }} cache: true - name: Setup tests (${{ matrix.name }}) - run: make ${{ matrix.make1 }} + run: make ${{ matrix.setup }} - name: Run tests (${{ matrix.name }}) env: RACE_DETECTOR: ${{ matrix.race }} - run: make -es -C ${{ matrix.name }} ${{ matrix.make2 }} > test-output + run: make -es -C ${{ matrix.name }} test > test-output timeout-minutes: 100 continue-on-error: true - name: Process test results (${{ matrix.name }}) From d1776c83e9212861a77ad1e9ca462a0029f86bc3 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 13:18:18 -0600 Subject: [PATCH 173/200] more clarifications to BLST version update README --- crypto/blst_src/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index d283b4dd6c4..d8d8be5313a 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -17,16 +17,16 @@ The folder contains: - this `README` file. To upgrade the BLST version: -- [ ] delete all files in this folder (`./blst_src`) but `blst_src.c` and `README.md`. +- [ ] delete all files in this folder `./blst_src/` but `blst_src.c` and `README.md`. - [ ] open BLST repository on the new version. -- [ ] copy all `.c` and `.h` files from `/src/` into this folder. -- [ ] delete `server.c` from this folder. -- [ ] update `blst_src.c` if needed. -- [ ] copy the folder `/build/` into this folder. -- [ ] move `./blst_src/build/assembly.S` to `./blst_src/build/blst_assembly.S`. -- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into this folder. -- [ ] check that C flags in `./bls12381_utils.go` still match the C flags in `/bindings/go/blst.go`. +- [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. +- [ ] delete `./blst_src/server.c`. +- [ ] copy the folder `/build/` into this folder `./blst_src`. +- [ ] move `./blst_src/build/assembly.S` to `./blst_assembly.S`. +- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into `./blst_src`. +- [ ] update `./blst_src/blst_src.c` if needed. 
+- [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. -- [ ] update the commit version on this `README`. +- [ ] update the commit version on this `./blst_src/README`. Remember that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should made along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. \ No newline at end of file From ecf702108f441d76aacc88cd5bf52ca67bdff0fc Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 13:51:42 -0600 Subject: [PATCH 174/200] update BLST source to v0.3.11 --- crypto/blst_assembly.S | 49 ++- crypto/blst_src/LICENSE | 201 ------------ crypto/blst_src/README.md | 9 +- crypto/blst_src/aggregate.c | 2 +- crypto/blst_src/blst_src.c | 3 +- crypto/blst_src/build/bindings_trim.pl | 29 +- .../blst_src/build/coff/add_mod_256-x86_64.s | 105 ++++--- .../blst_src/build/coff/add_mod_384-x86_64.s | 187 ++++++----- .../build/coff/add_mod_384x384-x86_64.s | 28 +- .../build/coff/ct_inverse_mod_256-armv8.S | 5 +- .../build/coff/ct_inverse_mod_256-x86_64.s | 14 +- .../build/coff/ct_inverse_mod_384-armv8.S | 5 +- .../build/coff/ct_is_square_mod_384-armv8.S | 1 + .../build/coff/ct_is_square_mod_384-x86_64.s | 10 +- .../build/coff/ctq_inverse_mod_384-x86_64.s | 15 +- .../build/coff/ctx_inverse_mod_384-x86_64.s | 21 +- crypto/blst_src/build/coff/div3w-armv8.S | 2 +- crypto/blst_src/build/coff/div3w-x86_64.s | 126 +++++++- .../build/coff/mulq_mont_256-x86_64.s | 57 +++- .../build/coff/mulq_mont_384-x86_64.s | 251 ++++++++++----- .../build/coff/mulx_mont_256-x86_64.s | 44 ++- .../build/coff/mulx_mont_384-x86_64.s | 207 ++++++++----- crypto/blst_src/build/coff/sha256-armv8.S | 8 +- .../build/coff/sha256-portable-x86_64.s | 54 ++-- crypto/blst_src/build/coff/sha256-x86_64.s | 182 +++++------ .../build/elf/ct_inverse_mod_256-armv8.S | 5 +- .../build/elf/ct_inverse_mod_256-x86_64.s | 1 + .../build/elf/ct_inverse_mod_384-armv8.S | 5 +- .../build/elf/ct_is_square_mod_384-armv8.S | 1 + .../build/elf/ct_is_square_mod_384-x86_64.s | 1 + .../build/elf/ctq_inverse_mod_384-x86_64.s | 6 + .../build/elf/ctx_inverse_mod_384-x86_64.s | 14 +- crypto/blst_src/build/elf/div3w-armv8.S | 2 +- crypto/blst_src/build/elf/div3w-x86_64.s | 15 +- .../blst_src/build/elf/mulq_mont_256-x86_64.s | 17 + .../blst_src/build/elf/mulq_mont_384-x86_64.s | 119 +++++-- .../blst_src/build/elf/mulx_mont_256-x86_64.s | 4 + .../blst_src/build/elf/mulx_mont_384-x86_64.s | 69 +++-- crypto/blst_src/build/elf/sha256-armv8.S | 8 +- .../build/elf/sha256-portable-x86_64.s | 36 ++- crypto/blst_src/build/elf/sha256-x86_64.s | 67 ++-- .../build/mach-o/ct_inverse_mod_256-armv8.S | 5 +- .../build/mach-o/ct_inverse_mod_256-x86_64.s | 1 + .../build/mach-o/ct_inverse_mod_384-armv8.S | 5 +- .../build/mach-o/ct_is_square_mod_384-armv8.S | 1 + .../mach-o/ct_is_square_mod_384-x86_64.s | 1 + .../build/mach-o/ctq_inverse_mod_384-x86_64.s | 6 + .../build/mach-o/ctx_inverse_mod_384-x86_64.s | 10 +- crypto/blst_src/build/mach-o/div3w-armv8.S | 2 +- crypto/blst_src/build/mach-o/div3w-x86_64.s | 15 +- .../build/mach-o/mulq_mont_256-x86_64.s | 17 + .../build/mach-o/mulq_mont_384-x86_64.s | 103 ++++-- .../build/mach-o/mulx_mont_256-x86_64.s | 4 + .../build/mach-o/mulx_mont_384-x86_64.s | 53 ++-- 
crypto/blst_src/build/mach-o/sha256-armv8.S | 8 +- .../build/mach-o/sha256-portable-x86_64.s | 36 ++- crypto/blst_src/build/mach-o/sha256-x86_64.s | 67 ++-- crypto/blst_src/build/refresh.sh | 48 +++ .../build/win64/add_mod_256-x86_64.asm | 115 +++---- .../build/win64/add_mod_384-x86_64.asm | 215 +++++++------ .../build/win64/add_mod_384x384-x86_64.asm | 32 +- crypto/blst_src/build/win64/blst.def | 4 + .../build/win64/ct_inverse_mod_256-armv8.asm | 5 +- .../build/win64/ct_inverse_mod_256-x86_64.asm | 21 +- .../build/win64/ct_inverse_mod_384-armv8.asm | 5 +- .../win64/ct_is_square_mod_384-armv8.asm | 1 + .../win64/ct_is_square_mod_384-x86_64.asm | 15 +- .../win64/ctq_inverse_mod_384-x86_64.asm | 24 +- .../win64/ctx_inverse_mod_384-x86_64.asm | 30 +- crypto/blst_src/build/win64/div3w-armv8.asm | 2 +- crypto/blst_src/build/win64/div3w-x86_64.asm | 117 ++++++- .../build/win64/mulq_mont_256-x86_64.asm | 69 +++-- .../build/win64/mulq_mont_384-x86_64.asm | 292 ++++++++++++------ .../build/win64/mulx_mont_256-x86_64.asm | 54 ++-- .../build/win64/mulx_mont_384-x86_64.asm | 246 +++++++++------ crypto/blst_src/build/win64/sha256-armv8.asm | 8 +- crypto/blst_src/build/win64/sha256-x86_64.asm | 189 ++++++------ crypto/blst_src/bulk_addition.c | 12 +- crypto/blst_src/bytes.h | 4 +- crypto/blst_src/client_min_pk.c | 17 + crypto/blst_src/client_min_sig.c | 17 + crypto/blst_src/cpuid.c | 85 +++++ crypto/blst_src/e1.c | 8 +- crypto/blst_src/e2.c | 8 +- crypto/blst_src/ec_mult.h | 5 +- crypto/blst_src/exports.c | 120 ++++--- crypto/blst_src/fields.h | 4 +- crypto/blst_src/fp12_tower.c | 4 +- crypto/blst_src/multi_scalar.c | 15 +- crypto/blst_src/pairing.c | 49 +++ crypto/blst_src/pentaroot.c | 4 +- crypto/blst_src/vect.h | 16 +- 92 files changed, 2650 insertions(+), 1529 deletions(-) delete mode 100644 crypto/blst_src/LICENSE create mode 100755 crypto/blst_src/build/refresh.sh create mode 100644 crypto/blst_src/client_min_pk.c create mode 100644 crypto/blst_src/client_min_sig.c create mode 100644 crypto/blst_src/cpuid.c diff --git a/crypto/blst_assembly.S b/crypto/blst_assembly.S index a1a7c5416e0..c0c5db30850 100644 --- a/crypto/blst_assembly.S +++ b/crypto/blst_assembly.S @@ -2,23 +2,22 @@ # if defined(__ELF__) # if defined(__BLST_PORTABLE__) # include "elf/sha256-portable-x86_64.s" -# else -# include "elf/sha256-x86_64.s" +# define blst_sha256_block_data_order blst_sha256_block_ssse3 # endif -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/ctx_inverse_mod_384-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/ctq_inverse_mod_384-x86_64.s" # endif # include "elf/add_mod_384-x86_64.s" # include "elf/add_mod_384x384-x86_64.s" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# define __sub_mod_384x384 __sub_mont_384x384 -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/mulx_mont_384-x86_64.s" # include "elf/mulx_mont_256-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "elf/mulq_mont_384-x86_64.s" # include "elf/mulq_mont_256-x86_64.s" # endif @@ -27,25 +26,20 @@ # include "elf/div3w-x86_64.s" # include "elf/ct_is_square_mod_384-x86_64.s" # elif defined(_WIN64) || defined(__CYGWIN__) -# if defined(__BLST_PORTABLE__) -# include "coff/sha256-portable-x86_64.s" -# else -# include "coff/sha256-x86_64.s" -# endif -# if 
defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/ctx_inverse_mod_384-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/ctq_inverse_mod_384-x86_64.s" # endif # include "coff/add_mod_384-x86_64.s" # include "coff/add_mod_384x384-x86_64.s" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# define __sub_mod_384x384 __sub_mont_384x384 -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/mulx_mont_384-x86_64.s" # include "coff/mulx_mont_256-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "coff/mulq_mont_384-x86_64.s" # include "coff/mulq_mont_256-x86_64.s" # endif @@ -55,20 +49,19 @@ # include "coff/ct_is_square_mod_384-x86_64.s" # elif defined(__APPLE__) # include "mach-o/sha256-x86_64.s" -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/ctx_inverse_mod_384-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/ctq_inverse_mod_384-x86_64.s" # endif # include "mach-o/add_mod_384-x86_64.s" # include "mach-o/add_mod_384x384-x86_64.s" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# define __sub_mod_384x384 __sub_mont_384x384 -# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# if defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/mulx_mont_384-x86_64.s" # include "mach-o/mulx_mont_256-x86_64.s" -# else +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) # include "mach-o/mulq_mont_384-x86_64.s" # include "mach-o/mulq_mont_256-x86_64.s" # endif diff --git a/crypto/blst_src/LICENSE b/crypto/blst_src/LICENSE deleted file mode 100644 index 261eeb9e9f8..00000000000 --- a/crypto/blst_src/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index d8d8be5313a..ff63254bbe5 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -1,5 +1,5 @@ -All files in this folder contain source files copied from the BLST repo https://github.com/supranational/blst -specifically from the commit <92c12ac58095de04e776cec5ef5ce5bdf242b693>. +All files in this folder contain source files copied from the BLST repo https://github.com/supranational/blst, +specifically from the tagged version v0.3.11. Copyright Supranational LLC Licensed under the Apache License, Version 2.0, see LICENSE for details. @@ -10,10 +10,8 @@ While BLST exports multiple functions and tools, the implementation in Flow cryp The folder contains: - BLST LICENSE file - all `/src/*.c` and `/src/*.h` files (C source files) but `server.c`. -- `server.c` is replaced by `blst_src.c` (which lists only the files needed by Flow crypto). 
+- `server.c` is replaced by `./blst_src.c` (which lists only the files needed by Flow crypto). - all `/build` (assembly generated files). -- `/bindings/blst.h` (headers of external functions). -- `/bindings/blst_aux.h` (headers of external aux functions). - this `README` file. To upgrade the BLST version: @@ -23,7 +21,6 @@ To upgrade the BLST version: - [ ] delete `./blst_src/server.c`. - [ ] copy the folder `/build/` into this folder `./blst_src`. - [ ] move `./blst_src/build/assembly.S` to `./blst_assembly.S`. -- [ ] copy `/bindings/blst.h` and `/bindings/blst_aux.h` into `./blst_src`. - [ ] update `./blst_src/blst_src.c` if needed. - [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. diff --git a/crypto/blst_src/aggregate.c b/crypto/blst_src/aggregate.c index 8a24e0590ba..ca78876acad 100644 --- a/crypto/blst_src/aggregate.c +++ b/crypto/blst_src/aggregate.c @@ -90,7 +90,7 @@ const void *blst_pairing_get_dst(const PAIRING *ctx) /* * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated - * signature vetification as discussed at + * signature verification as discussed at * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. * Usage pattern is not finalized yet, because (sig != NULL) is better and * will be handled separately... diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c index a50649e5788..9e064657e72 100644 --- a/crypto/blst_src/blst_src.c +++ b/crypto/blst_src/blst_src.c @@ -11,13 +11,14 @@ #include "map_to_g2.c" #include "fp12_tower.c" #include "pairing.c" -#include "aggregate.c" #include "exp.c" #include "sqrt.c" #include "recip.c" +#include "aggregate.c" #include "bulk_addition.c" #include "multi_scalar.c" #include "consts.c" #include "vect.c" #include "exports.c" + diff --git a/crypto/blst_src/build/bindings_trim.pl b/crypto/blst_src/build/bindings_trim.pl index 90f914578d9..0880352d79e 100755 --- a/crypto/blst_src/build/bindings_trim.pl +++ b/crypto/blst_src/build/bindings_trim.pl @@ -5,6 +5,10 @@ # traverse and remove auto-generated PartialEq for chosen types for (my $i = 0; $i <= $#file; $i++) { + if (@file[$i] =~ m/pub\s+(?:struct|enum)\s+(\w+)/) { + push @structs, $1; + } + if (@file[$i] =~ m/struct\s+blst_p[12]/) { @file[$i-1] =~ s/,\s*PartialEq//; } elsif (@file[$i] =~ m/struct\s+blst_fp12/) { @@ -15,23 +19,22 @@ @file[$i-1] =~ s/,\s*Copy//; @file[$i-1] =~ s/\)/, Zeroize\)/; splice @file, $i, 0, "#[zeroize(drop)]\n"; $i++; - } elsif (@file[$i] =~ m/assert_eq!\($/) { - @file[++$i] =~ s/unsafe\s*\{\s*&\(\*\(::std::ptr::null::<(\w+)>\(\)\)\)\.(\w+).*\}/offsetof!($1, $2)/; + } else { + @file[$i] =~ s/::std::/::core::/g; } } +print @file; + print << '___'; -#[cfg(test)] -macro_rules! 
offsetof { - ($type:ty, $field:tt) => { - { - let v = <$type>::default(); - (&v.$field as *const _ as usize) - (&v as *const _ as usize) - } - }; -} +#[test] +fn bindgen_test_normal_types() { + // from "Rust for Rustaceans" by Jon Gjengset + fn is_normal() {} ___ -# print the file -print @file; +for (@structs) { + print " is_normal::<$_>();\n"; +} +print "}\n"; close STDOUT; diff --git a/crypto/blst_src/build/coff/add_mod_256-x86_64.s b/crypto/blst_src/build/coff/add_mod_256-x86_64.s index f88e6189ca5..c2c83502a18 100644 --- a/crypto/blst_src/build/coff/add_mod_256-x86_64.s +++ b/crypto/blst_src/build/coff/add_mod_256-x86_64.s @@ -10,14 +10,14 @@ add_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -81,13 +81,13 @@ mul_by_3_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -161,14 +161,14 @@ lshift_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_lshift_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -218,14 +218,14 @@ rshift_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_rshift_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -307,14 +307,14 @@ cneg_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_cneg_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -385,14 +385,14 @@ sub_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -454,10 +454,10 @@ check_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_check_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi movq 0(%rdi),%rax movq 8(%rdi),%r9 movq 16(%rdi),%r10 @@ -497,14 +497,14 @@ add_n_check_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_n_check_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -573,14 +573,14 @@ sub_n_check_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_n_check_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx subq $8,%rsp @@ -744,8 +744,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -753,7 +754,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -764,8 +766,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 
.byte 0,0 +.long 0,0 .LSEH_info_mul_by_3_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -785,8 +788,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_lshift_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -806,8 +810,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_rshift_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -815,7 +820,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -826,8 +832,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_cneg_mod_256_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -847,8 +854,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -856,7 +864,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -873,8 +882,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_n_check_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -882,7 +892,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_n_check_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -893,8 +904,9 @@ sub_n_check_mod_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_n_check_mod_256_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -902,7 +914,8 @@ sub_n_check_mod_256: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_n_check_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/add_mod_384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384-x86_64.s index d1c7ad6e689..3ef562a3bf2 100644 --- a/crypto/blst_src/build/coff/add_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/add_mod_384-x86_64.s @@ -10,14 +10,14 @@ add_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -118,14 +118,14 @@ add_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -186,14 +186,14 @@ rshift_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_rshift_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -315,13 +315,13 @@ div_by_2_mod_384: movq %rsi,16(%rsp) 
movq %rsp,%r11 .LSEH_begin_div_by_2_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -387,14 +387,14 @@ lshift_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_lshift_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -528,13 +528,13 @@ mul_by_3_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -595,13 +595,13 @@ mul_by_8_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_8_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -669,13 +669,13 @@ mul_by_3_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_3_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -752,13 +752,13 @@ mul_by_8_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_8_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -845,14 +845,14 @@ cneg_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_cneg_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -952,14 +952,14 @@ sub_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -1058,14 +1058,14 @@ sub_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -1124,13 +1124,13 @@ mul_by_1_plus_i_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_by_1_plus_i_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx pushq %rbx pushq %r12 @@ -1274,10 +1274,10 @@ sgn0_pty_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi .LSEH_body_sgn0_pty_mod_384: movq 0(%rdi),%r8 @@ -1328,12 +1328,12 @@ sgn0_pty_mod_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mod_384x: - movq %rcx,%rdi - movq %rdx,%rsi pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi pushq %rbx subq $8,%rsp @@ -2134,8 +2134,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2147,7 +2148,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2158,8 +2160,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ 
-2171,7 +2174,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2182,8 +2186,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_rshift_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2195,7 +2200,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_rshift_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2206,8 +2212,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_div_by_2_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2219,7 +2226,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_div_by_2_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2230,8 +2238,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_lshift_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2243,7 +2252,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_lshift_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2254,8 +2264,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_3_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2267,7 +2278,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2278,8 +2290,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_8_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2291,7 +2304,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_8_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2302,8 +2316,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_3_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2315,7 +2330,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_3_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2326,8 +2342,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_8_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2339,7 +2356,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 
.LSEH_info_mul_by_8_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2350,8 +2368,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_cneg_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2363,7 +2382,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_cneg_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2374,8 +2394,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -2387,7 +2408,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2398,8 +2420,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -2411,7 +2434,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2422,8 +2446,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_by_1_plus_i_mod_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x07,0x00 @@ -2435,7 +2460,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x0e,0x00 .byte 0x00,0x64,0x0f,0x00 .byte 0x00,0xc2 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2446,8 +2472,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mod_384_body: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -2463,8 +2490,9 @@ vec_is_equal_16x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mod_384x_body: .byte 1,0,9,0 .byte 0x00,0x34,0x01,0x00 @@ -2472,7 +2500,8 @@ vec_is_equal_16x: .byte 0x00,0x74,0x04,0x00 .byte 0x00,0x64,0x05,0x00 .byte 0x00,0x22 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mod_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s index 79976cc0e7a..53662b4a56a 100644 --- a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s +++ b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s @@ -145,14 +145,14 @@ add_mod_384x384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_add_mod_384x384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -202,14 +202,14 @@ sub_mod_384x384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sub_mod_384x384: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -280,8 +280,9 @@ sub_mod_384x384: .byte 1,0,5,0x0b 
.byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_add_mod_384x384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -293,7 +294,8 @@ sub_mod_384x384: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_add_mod_384x384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -304,8 +306,9 @@ sub_mod_384x384: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sub_mod_384x384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -317,7 +320,8 @@ sub_mod_384x384: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sub_mod_384x384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S index 17c3d25278f..d2fd83182b4 100644 --- a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 + .def ct_inverse_mod_256; .type 32; .endef @@ -62,14 +63,14 @@ ct_inverse_mod_256: madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s index e7d4a6313b1..d1aa7597bc0 100644 --- a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 + .def ct_inverse_mod_256; .scl 2; .type 32; .endef .p2align 5 ct_inverse_mod_256: @@ -9,14 +10,14 @@ ct_inverse_mod_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_inverse_mod_256: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx pushq %rbx pushq %r12 @@ -1188,8 +1189,9 @@ __inner_loop_62_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ct_inverse_mod_256_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x86,0x00 @@ -1201,6 +1203,8 @@ __inner_loop_62_256: .byte 0x00,0x74,0x8d,0x00 .byte 0x00,0x64,0x8e,0x00 .byte 0x00,0x01,0x8c,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_inverse_mod_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S index 65193f1e96a..86fdc405828 100644 --- a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S +++ b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_383 + .def ct_inverse_mod_383; .type 32; .endef @@ -73,7 +74,7 @@ ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -84,7 +85,7 @@ ct_inverse_mod_383: adds x3, x3, x5 
adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S index 34336ff486b..efe90a82144 100644 --- a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 + .def ct_is_square_mod_384; .type 32; .endef diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s index ee4790321e6..9ac32f50852 100644 --- a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 + .def ct_is_square_mod_384; .scl 2; .type 32; .endef .p2align 5 ct_is_square_mod_384: @@ -9,12 +10,12 @@ ct_is_square_mod_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_is_square_mod_384: - movq %rcx,%rdi - movq %rdx,%rsi pushq %rbp + movq %rcx,%rdi + movq %rdx,%rsi pushq %rbx pushq %r12 @@ -484,8 +485,9 @@ __inner_loop_48: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ct_is_square_mod_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x43,0x00 @@ -497,6 +499,8 @@ __inner_loop_48: .byte 0x00,0x74,0x4a,0x00 .byte 0x00,0x64,0x4b,0x00 .byte 0x00,0x01,0x49,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_is_square_mod_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s index 42f058a3c8d..d027a6dc5c0 100644 --- a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s @@ -1,6 +1,8 @@ +.comm __blst_platform_cap,4 .text .globl ct_inverse_mod_383 + .def ct_inverse_mod_383; .scl 2; .type 32; .endef .p2align 5 ct_inverse_mod_383: @@ -9,12 +11,16 @@ ct_inverse_mod_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ct_inverse_mod_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif pushq %rbp pushq %rbx @@ -1200,8 +1206,9 @@ __inner_loop_62: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ct_inverse_mod_383_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x8b,0x00 @@ -1213,6 +1220,8 @@ __inner_loop_62: .byte 0x00,0x74,0x92,0x00 .byte 0x00,0x64,0x93,0x00 .byte 0x00,0x01,0x91,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ct_inverse_mod_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s index 7c13e56eb2a..4f7dd6d1552 100644 --- a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ctx_inverse_mod_383 + .def ctx_inverse_mod_383; .scl 2; .type 32; .endef .p2align 5 ctx_inverse_mod_383: @@ -9,12 +10,13 @@ ctx_inverse_mod_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_ctx_inverse_mod_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +ct_inverse_mod_383$1: pushq %rbp pushq %rbx @@ -812,7 
+814,7 @@ ctx_inverse_mod_383: movq 48(%rsi),%r10 - call __inner_loop_62 + call __tail_loop_53 @@ -1514,9 +1516,9 @@ __inner_loop_31: .byte 0xf3,0xc3 -.def __inner_loop_62; .scl 3; .type 32; .endef +.def __tail_loop_53; .scl 3; .type 32; .endef .p2align 5 -__inner_loop_62: +__tail_loop_53: .byte 0xf3,0x0f,0x1e,0xfa movq $1,%rdx @@ -1524,7 +1526,7 @@ __inner_loop_62: xorq %r12,%r12 movq $1,%r13 -.Loop_62: +.Loop_53: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx @@ -1551,7 +1553,7 @@ __inner_loop_62: subq %rax,%rdx subq %rbx,%rcx subl $1,%edi - jnz .Loop_62 + jnz .Loop_53 .byte 0xf3,0xc3 @@ -1575,8 +1577,9 @@ __inner_loop_62: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_ctx_inverse_mod_383_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x8b,0x00 @@ -1588,6 +1591,8 @@ __inner_loop_62: .byte 0x00,0x74,0x92,0x00 .byte 0x00,0x64,0x93,0x00 .byte 0x00,0x01,0x91,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_ctx_inverse_mod_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/div3w-armv8.S b/crypto/blst_src/build/coff/div3w-armv8.S index c17b9e38336..2e5d7045d6a 100644 --- a/crypto/blst_src/build/coff/div3w-armv8.S +++ b/crypto/blst_src/build/coff/div3w-armv8.S @@ -27,7 +27,7 @@ div_3_limbs: asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/coff/div3w-x86_64.s b/crypto/blst_src/build/coff/div3w-x86_64.s index fcfe54480be..033d1eb3055 100644 --- a/crypto/blst_src/build/coff/div3w-x86_64.s +++ b/crypto/blst_src/build/coff/div3w-x86_64.s @@ -8,11 +8,14 @@ div_3_limbs: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) - movq %rsp,%rax + movq %rsp,%r11 .LSEH_begin_div_3_limbs: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx +.LSEH_body_div_3_limbs: movq (%rdi),%r8 movq 8(%rdi),%r9 @@ -45,9 +48,12 @@ div_3_limbs: orq %rcx,%rax - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi +.LSEH_epilogue_div_3_limbs: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + .byte 0xf3,0xc3 + .LSEH_end_div_3_limbs: .globl quot_rem_128 @@ -57,11 +63,14 @@ quot_rem_128: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) - movq %rsp,%rax + movq %rsp,%r11 .LSEH_begin_quot_rem_128: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx +.LSEH_body_quot_rem_128: movq %rdx,%rax movq %rdx,%rcx @@ -97,9 +106,12 @@ quot_rem_128: movq %rcx,%rax - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi +.LSEH_epilogue_quot_rem_128: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + .byte 0xf3,0xc3 + .LSEH_end_quot_rem_128: @@ -114,11 +126,14 @@ quot_rem_64: .byte 0xf3,0x0f,0x1e,0xfa movq %rdi,8(%rsp) movq %rsi,16(%rsp) - movq %rsp,%rax + movq %rsp,%r11 .LSEH_begin_quot_rem_64: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx +.LSEH_body_quot_rem_64: movq %rdx,%rax imulq 0(%rsi),%rdx @@ -130,11 +145,104 @@ quot_rem_64: movq %r10,0(%rdi) movq %rax,8(%rdi) - movq 8(%rsp),%rdi - movq 16(%rsp),%rsi +.LSEH_epilogue_quot_rem_64: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + .byte 0xf3,0xc3 + .LSEH_end_quot_rem_64: .section .pdata .p2align 2 +.rva .LSEH_begin_div_3_limbs +.rva .LSEH_body_div_3_limbs +.rva .LSEH_info_div_3_limbs_prologue + +.rva .LSEH_body_div_3_limbs +.rva .LSEH_epilogue_div_3_limbs +.rva .LSEH_info_div_3_limbs_body + +.rva .LSEH_epilogue_div_3_limbs +.rva .LSEH_end_div_3_limbs +.rva .LSEH_info_div_3_limbs_epilogue + +.rva .LSEH_begin_quot_rem_128 +.rva 
.LSEH_body_quot_rem_128 +.rva .LSEH_info_quot_rem_128_prologue + +.rva .LSEH_body_quot_rem_128 +.rva .LSEH_epilogue_quot_rem_128 +.rva .LSEH_info_quot_rem_128_body + +.rva .LSEH_epilogue_quot_rem_128 +.rva .LSEH_end_quot_rem_128 +.rva .LSEH_info_quot_rem_128_epilogue + +.rva .LSEH_begin_quot_rem_64 +.rva .LSEH_body_quot_rem_64 +.rva .LSEH_info_quot_rem_64_prologue + +.rva .LSEH_body_quot_rem_64 +.rva .LSEH_epilogue_quot_rem_64 +.rva .LSEH_info_quot_rem_64_body + +.rva .LSEH_epilogue_quot_rem_64 +.rva .LSEH_end_quot_rem_64 +.rva .LSEH_info_quot_rem_64_epilogue + .section .xdata .p2align 3 +.LSEH_info_div_3_limbs_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_div_3_limbs_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_div_3_limbs_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_quot_rem_128_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_quot_rem_128_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_quot_rem_128_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_quot_rem_64_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_quot_rem_64_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_quot_rem_64_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s index dd1e00fa301..2dd30bc5b5d 100644 --- a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s +++ b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl mul_mont_sparse_256 @@ -10,13 +11,17 @@ mul_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif pushq %rbp pushq %rbx @@ -80,12 +85,16 @@ sqr_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif pushq %rbp pushq %rbx @@ -430,12 +439,16 @@ from_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_from_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif pushq %rbp pushq %rbx @@ -510,12 +523,16 @@ redc_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redc_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif pushq %rbp pushq %rbx @@ -778,8 +795,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -791,7 +809,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 
0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -802,8 +821,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -815,7 +835,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -826,8 +847,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_from_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -839,7 +861,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -850,8 +873,9 @@ __mulq_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redc_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -863,7 +887,8 @@ __mulq_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s index 5663463524a..ee646f5b137 100644 --- a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s +++ b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text @@ -6,9 +7,9 @@ -.def __sub_mod_384x384; .scl 3; .type 32; .endef +.def __subq_mod_384x384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384x384: +__subq_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -73,9 +74,9 @@ __sub_mod_384x384: .byte 0xf3,0xc3 -.def __add_mod_384; .scl 3; .type 32; .endef +.def __addq_mod_384; .scl 3; .type 32; .endef .p2align 5 -__add_mod_384: +__addq_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -123,9 +124,9 @@ __add_mod_384: .byte 0xf3,0xc3 -.def __sub_mod_384; .scl 3; .type 32; .endef +.def __subq_mod_384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384: +__subq_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -135,7 +136,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -182,13 +183,17 @@ mul_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif pushq %rbp pushq %rbx @@ -229,12 +234,12 @@ mul_mont_384x: movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -244,17 +249,17 @@ mul_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 
(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 movq %rcx,%rbx @@ -263,14 +268,14 @@ mul_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -304,12 +309,16 @@ sqr_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif pushq %rbp pushq %rbx @@ -335,13 +344,13 @@ sqr_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subq_mod_384 movq 16(%rsp),%rsi @@ -433,12 +442,16 @@ mul_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_382x$1 +#endif pushq %rbp pushq %rbx @@ -528,18 +541,18 @@ mul_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -573,11 +586,15 @@ sqr_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif pushq %rbp pushq %rbx @@ -628,7 +645,7 @@ sqr_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -710,11 +727,15 @@ mul_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_384$1 +#endif pushq %rbp pushq %rbx @@ -1039,10 +1060,14 @@ sqr_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_384: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_384$1 +#endif pushq %rbp pushq %rbx @@ -1286,12 +1311,16 @@ sqr_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1321,7 +1350,7 @@ sqr_mont_384: movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 @@ -1358,12 +1387,16 @@ redc_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redc_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1383,7 +1416,7 @@ redc_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movq 8(%rsp),%r15 @@ -1420,12 +1453,16 @@ 
from_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_from_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1795,9 +1832,9 @@ __mulq_by_1_mont_384: .byte 0xf3,0xc3 -.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.def __redq_tail_mont_384; .scl 3; .type 32; .endef .p2align 5 -__redc_tail_mont_384: +__redq_tail_mont_384: .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 @@ -1852,11 +1889,15 @@ sgn0_pty_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -1934,11 +1975,15 @@ sgn0_pty_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0_pty_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif pushq %rbp pushq %rbx @@ -2065,13 +2110,17 @@ mul_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mul_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -2733,14 +2782,18 @@ sqr_n_mul_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_n_mul_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif pushq %rbp pushq %rbx @@ -2774,7 +2827,7 @@ sqr_n_mul_mont_384: movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi @@ -2828,14 +2881,18 @@ sqr_n_mul_mont_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_n_mul_mont_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif pushq %rbp pushq %rbx @@ -3494,12 +3551,16 @@ sqr_mont_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqr_mont_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif pushq %rbp pushq %rbx @@ -3858,8 +3919,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x29,0x00 @@ -3871,6 +3933,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x30,0x00 .byte 0x00,0x64,0x31,0x00 .byte 0x00,0x01,0x2f,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3881,8 +3945,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3894,6 +3959,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3904,8 +3971,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 
+.long 0,0 .LSEH_info_mul_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3917,6 +3985,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3927,8 +3997,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_382x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3940,7 +4011,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3951,8 +4023,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_384_body: .byte 1,0,11,0 .byte 0x00,0xc4,0x00,0x00 @@ -3972,8 +4045,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3985,7 +4059,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3996,8 +4071,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x0f,0x00 @@ -4009,6 +4085,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x16,0x00 .byte 0x00,0x64,0x17,0x00 .byte 0x00,0x01,0x15,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4019,8 +4097,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redc_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4032,7 +4111,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redc_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4043,8 +4123,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_from_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4056,7 +4137,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_from_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4067,8 +4149,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4080,7 +4163,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4091,8 +4175,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0_pty_mont_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -4104,7 +4189,8 @@ 
sqr_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0_pty_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4115,8 +4201,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mul_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -4128,7 +4215,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mul_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4139,8 +4227,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_n_mul_mont_384_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -4152,6 +4241,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4162,8 +4253,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_n_mul_mont_383_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -4175,6 +4267,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_n_mul_mont_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -4185,8 +4279,9 @@ sqr_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqr_mont_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -4198,6 +4293,8 @@ sqr_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqr_mont_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s index 75c7e82bc1a..cba65569c52 100644 --- a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s +++ b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s @@ -10,13 +10,14 @@ mulx_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +mul_mont_sparse_256$1: pushq %rbp pushq %rbx @@ -78,12 +79,13 @@ sqrx_mont_sparse_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_sparse_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_sparse_256$1: pushq %rbp pushq %rbx @@ -342,12 +344,13 @@ fromx_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_fromx_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +from_mont_256$1: pushq %rbp pushq %rbx @@ -422,12 +425,13 @@ redcx_mont_256: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redcx_mont_256: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +redc_mont_256$1: pushq %rbp pushq %rbx @@ -690,8 +694,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -703,7 +708,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 
0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -714,8 +720,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_sparse_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -727,7 +734,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_sparse_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -738,8 +746,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_fromx_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -751,7 +760,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -762,8 +772,9 @@ __mulx_by_1_mont_256: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redcx_mont_256_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -775,7 +786,8 @@ __mulx_by_1_mont_256: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_256_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s index 12306a7ff5c..ce1354f46b4 100644 --- a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s +++ b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s @@ -6,9 +6,9 @@ -.def __sub_mod_384x384; .scl 3; .type 32; .endef +.def __subx_mod_384x384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384x384: +__subx_mod_384x384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -73,9 +73,9 @@ __sub_mod_384x384: .byte 0xf3,0xc3 -.def __add_mod_384; .scl 3; .type 32; .endef +.def __addx_mod_384; .scl 3; .type 32; .endef .p2align 5 -__add_mod_384: +__addx_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -123,9 +123,9 @@ __add_mod_384: .byte 0xf3,0xc3 -.def __sub_mod_384; .scl 3; .type 32; .endef +.def __subx_mod_384; .scl 3; .type 32; .endef .p2align 5 -__sub_mod_384: +__subx_mod_384: .byte 0xf3,0x0f,0x1e,0xfa movq 0(%rsi),%r8 @@ -135,7 +135,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -182,13 +182,14 @@ mulx_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - +mul_mont_384x$1: pushq %rbp pushq %rbx @@ -230,12 +231,12 @@ mulx_mont_384x: leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -245,17 +246,17 @@ mulx_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - 
call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rcx),%rbx @@ -264,14 +265,14 @@ mulx_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -305,12 +306,13 @@ sqrx_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_384x$1: pushq %rbp pushq %rbx @@ -337,13 +339,13 @@ sqrx_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subx_mod_384 movq 24(%rsp),%rsi @@ -445,12 +447,13 @@ mulx_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +mul_382x$1: pushq %rbp pushq %rbx @@ -540,18 +543,18 @@ mulx_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -585,11 +588,12 @@ sqrx_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +sqr_382x$1: pushq %rbp pushq %rbx @@ -640,7 +644,7 @@ sqrx_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -722,11 +726,12 @@ mulx_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +mul_384$1: pushq %rbp pushq %rbx @@ -950,10 +955,11 @@ sqrx_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_384: - movq %rcx,%rdi - movq %rdx,%rsi + movq %rcx,%rdi + movq %rdx,%rsi +sqr_384$1: pushq %rbp pushq %rbx @@ -1145,12 +1151,13 @@ redcx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_redcx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +redc_mont_384$1: pushq %rbp pushq %rbx @@ -1170,7 +1177,7 @@ redcx_mont_384: movq %rdx,%rbx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 movq 8(%rsp),%r15 @@ -1207,12 +1214,13 @@ fromx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_fromx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +from_mont_384$1: pushq %rbp pushq %rbx @@ -1473,9 +1481,9 @@ __mulx_by_1_mont_384: .byte 0xf3,0xc3 -.def __redc_tail_mont_384; .scl 3; .type 32; .endef +.def __redx_tail_mont_384; .scl 3; .type 32; .endef .p2align 5 -__redc_tail_mont_384: +__redx_tail_mont_384: .byte 0xf3,0x0f,0x1e,0xfa addq 48(%rsi),%r14 @@ -1530,11 +1538,12 @@ sgn0x_pty_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0x_pty_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +sgn0_pty_mont_384$1: pushq %rbp pushq %rbx @@ -1612,11 +1621,12 @@ sgn0x_pty_mont_384x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sgn0x_pty_mont_384x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +sgn0_pty_mont_384x$1: pushq %rbp pushq %rbx @@ -1743,13 +1753,14 @@ mulx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_mulx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 - - 
+mul_mont_384$1: pushq %rbp pushq %rbx @@ -2215,12 +2226,13 @@ sqrx_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_384$1: pushq %rbp pushq %rbx @@ -2287,14 +2299,15 @@ sqrx_n_mul_mont_384: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_n_mul_mont_384: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +sqr_n_mul_mont_384$1: pushq %rbp pushq %rbx @@ -2379,14 +2392,15 @@ sqrx_n_mul_mont_383: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_n_mul_mont_383: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx movq 40(%rsp),%r8 movq 48(%rsp),%r9 - - +sqr_n_mul_mont_383$1: pushq %rbp pushq %rbx @@ -2831,12 +2845,13 @@ sqrx_mont_382x: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_sqrx_mont_382x: + + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx movq %r9,%rcx - - +sqr_mont_382x$1: pushq %rbp pushq %rbx @@ -3205,8 +3220,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x29,0x00 @@ -3218,6 +3234,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x30,0x00 .byte 0x00,0x64,0x31,0x00 .byte 0x00,0x01,0x2f,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3228,8 +3246,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_384x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3241,6 +3260,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3251,8 +3272,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3264,6 +3286,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3274,8 +3298,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_382x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3287,7 +3312,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3298,8 +3324,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x00,0x00 @@ -3311,7 +3338,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x07,0x00 .byte 0x00,0x64,0x08,0x00 .byte 0x00,0x52 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3322,8 +3350,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3335,7 +3364,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 
.byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3346,8 +3376,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_redcx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3359,7 +3390,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_redcx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3370,8 +3402,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_fromx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3383,7 +3416,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_fromx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3394,8 +3428,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0x_pty_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3407,7 +3442,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3418,8 +3454,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sgn0x_pty_mont_384x_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x01,0x00 @@ -3431,7 +3468,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x08,0x00 .byte 0x00,0x64,0x09,0x00 .byte 0x00,0x62 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sgn0x_pty_mont_384x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3442,8 +3480,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_mulx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -3455,7 +3494,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_mulx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3466,8 +3506,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x03,0x00 @@ -3479,7 +3520,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0a,0x00 .byte 0x00,0x64,0x0b,0x00 .byte 0x00,0x82 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3490,8 +3532,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_n_mul_mont_384_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x05,0x00 @@ -3503,7 +3546,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0c,0x00 .byte 0x00,0x64,0x0d,0x00 .byte 0x00,0xa2 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_n_mul_mont_384_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3514,8 +3558,9 @@ sqrx_mont_382x: .byte 
1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_n_mul_mont_383_body: .byte 1,0,17,0 .byte 0x00,0xf4,0x05,0x00 @@ -3527,7 +3572,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x0c,0x00 .byte 0x00,0x64,0x0d,0x00 .byte 0x00,0xa2 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_n_mul_mont_383_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 @@ -3538,8 +3584,9 @@ sqrx_mont_382x: .byte 1,0,5,0x0b .byte 0,0x74,1,0 .byte 0,0x64,2,0 -.byte 0,0x03 +.byte 0,0xb3 .byte 0,0 +.long 0,0 .LSEH_info_sqrx_mont_382x_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x11,0x00 @@ -3551,6 +3598,8 @@ sqrx_mont_382x: .byte 0x00,0x74,0x18,0x00 .byte 0x00,0x64,0x19,0x00 .byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_sqrx_mont_382x_epilogue: .byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 diff --git a/crypto/blst_src/build/coff/sha256-armv8.S b/crypto/blst_src/build/coff/sha256-armv8.S index a8bcbd3631b..a4cd8090896 100644 --- a/crypto/blst_src/build/coff/sha256-armv8.S +++ b/crypto/blst_src/build/coff/sha256-armv8.S @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. +.comm __blst_platform_cap,4 .text .p2align 6 @@ -188,6 +189,11 @@ blst_sha256_block_armv8: .endef .p2align 4 blst_sha256_block_data_order: + adrp x16,__blst_platform_cap + ldr w16,[x16,#:lo12:__blst_platform_cap] + tst w16,#1 + b.ne .Lv8_entry + stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/coff/sha256-portable-x86_64.s b/crypto/blst_src/build/coff/sha256-portable-x86_64.s index e499d107c70..603e46c53d7 100644 --- a/crypto/blst_src/build/coff/sha256-portable-x86_64.s +++ b/crypto/blst_src/build/coff/sha256-portable-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl blst_sha256_block_data_order @@ -9,15 +10,21 @@ blst_sha256_block_data_order: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order: + + + pushq %rbp + + movq %rsp,%rbp + movq %rcx,%rdi movq %rdx,%rsi movq %r8,%rdx - - +#ifdef __BLST_PORTABLE__ + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 +#endif pushq %rbx - pushq %rbp - pushq %r12 pushq %r13 @@ -29,12 +36,13 @@ blst_sha256_block_data_order: shlq $4,%rdx subq $64+24,%rsp + +.LSEH_body_blst_sha256_block_data_order: + leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) -.LSEH_body_blst_sha256_block_data_order: - movl 0(%rdi),%eax movl 4(%rdi),%ebx @@ -1637,17 +1645,11 @@ blst_sha256_block_data_order: leaq 64+24+48(%rsp),%r11 movq 64+24(%rsp),%r15 - movq -40(%r11),%r14 - movq -32(%r11),%r13 - movq -24(%r11),%r12 - - movq -16(%r11),%rbp - - movq -8(%r11),%rbx - + movq -16(%r11),%rbx + movq -8(%r11),%rbp .LSEH_epilogue_blst_sha256_block_data_order: mov 8(%r11),%rdi mov 16(%r11),%rsi @@ -1657,6 +1659,7 @@ blst_sha256_block_data_order: .LSEH_end_blst_sha256_block_data_order: +#ifndef __BLST_PORTABLE__ .p2align 6 K256: @@ -1742,6 +1745,7 @@ blst_sha256_hcopy: movq %r11,24(%rcx) .byte 0xf3,0xc3 +#endif .section .pdata .p2align 2 .rva .LSEH_begin_blst_sha256_block_data_order @@ -1759,26 +1763,30 @@ blst_sha256_hcopy: .section .xdata .p2align 3 .LSEH_info_blst_sha256_block_data_order_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 
-.byte 0,0x64,2,0 -.byte 0,0x03 -.byte 0,0 +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 .LSEH_info_blst_sha256_block_data_order_body: .byte 1,0,18,0 .byte 0x00,0xf4,0x0b,0x00 .byte 0x00,0xe4,0x0c,0x00 .byte 0x00,0xd4,0x0d,0x00 .byte 0x00,0xc4,0x0e,0x00 -.byte 0x00,0x54,0x0f,0x00 -.byte 0x00,0x34,0x10,0x00 +.byte 0x00,0x34,0x0f,0x00 +.byte 0x00,0x54,0x10,0x00 .byte 0x00,0x74,0x12,0x00 .byte 0x00,0x64,0x13,0x00 .byte 0x00,0x01,0x11,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_epilogue: .byte 1,0,5,11 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x03 -.byte 0x00,0x00 +.byte 0x00,0xb3 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 diff --git a/crypto/blst_src/build/coff/sha256-x86_64.s b/crypto/blst_src/build/coff/sha256-x86_64.s index ed28b781d4c..d65df5d0d4d 100644 --- a/crypto/blst_src/build/coff/sha256-x86_64.s +++ b/crypto/blst_src/build/coff/sha256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .p2align 6 @@ -34,22 +35,23 @@ blst_sha256_block_data_order_shaext: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order_shaext: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - - - subq $0x58,%rsp - movaps %xmm6,-88(%r11) - movaps %xmm7,-72(%r11) + pushq %rbp - movaps %xmm8,-56(%r11) + movq %rsp,%rbp - movaps %xmm9,-40(%r11) + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +.Lblst_sha256_block_data_order$2: + subq $0x50,%rsp - movaps %xmm10,-24(%r11) + movaps %xmm6,-80(%rbp) + movaps %xmm7,-64(%rbp) + movaps %xmm8,-48(%rbp) + movaps %xmm9,-32(%rbp) + movaps %xmm10,-16(%rbp) .LSEH_body_blst_sha256_block_data_order_shaext: @@ -254,16 +256,18 @@ blst_sha256_block_data_order_shaext: movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) - movaps -88(%r11),%xmm6 - movaps -72(%r11),%xmm7 - movaps -56(%r11),%xmm8 - movaps -40(%r11),%xmm9 - movaps -24(%r11),%xmm10 - movq %r11,%rsp + movaps -80(%rbp),%xmm6 + movaps -64(%rbp),%xmm7 + movaps -48(%rbp),%xmm8 + movaps -32(%rbp),%xmm9 + movaps -16(%rbp),%xmm10 + movq %rbp,%rsp + + popq %rbp .LSEH_epilogue_blst_sha256_block_data_order_shaext: - mov 8(%r11),%rdi - mov 16(%r11),%rsi + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi .byte 0xf3,0xc3 @@ -278,13 +282,17 @@ blst_sha256_block_data_order: movq %rsi,16(%rsp) movq %rsp,%r11 .LSEH_begin_blst_sha256_block_data_order: - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx pushq %rbp + movq %rsp,%rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 pushq %rbx pushq %r12 @@ -296,21 +304,16 @@ blst_sha256_block_data_order: pushq %r15 shlq $4,%rdx - subq $104,%rsp + subq $88,%rsp leaq (%rsi,%rdx,4),%rdx - movq %rdi,0(%rsp) - - movq %rdx,16(%rsp) - movaps %xmm6,32(%rsp) - - movaps %xmm7,48(%rsp) + movq %rdi,-64(%rbp) - movaps %xmm8,64(%rsp) - - movaps %xmm9,80(%rsp) - - movq %rsp,%rbp + movq %rdx,-48(%rbp) + movaps %xmm6,-128(%rbp) + movaps %xmm7,-112(%rbp) + movaps %xmm8,-96(%rbp) + movaps %xmm9,-80(%rbp) .LSEH_body_blst_sha256_block_data_order: @@ -331,7 +334,7 @@ blst_sha256_block_data_order: .p2align 4 .Lloop_ssse3: movdqa K256+256(%rip),%xmm7 - movq %rsi,8(%rbp) + movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1356,9 +1359,9 @@ blst_sha256_block_data_order: addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d - movq 0(%rbp),%rdi + movq -64(%rbp),%rdi movl %r14d,%eax - movq 8(%rbp),%rsi + movq -56(%rbp),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx @@ 
-1370,7 +1373,7 @@ blst_sha256_block_data_order: addl 28(%rdi),%r11d leaq 64(%rsi),%rsi - cmpq 16(%rbp),%rsi + cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) @@ -1383,33 +1386,27 @@ blst_sha256_block_data_order: jb .Lloop_ssse3 xorps %xmm0,%xmm0 - leaq 104+48(%rbp),%r11 - movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) - movaps 32(%rbp),%xmm6 - movaps 48(%rbp),%xmm7 - movaps 64(%rbp),%xmm8 - movaps 80(%rbp),%xmm9 - movq 104(%rbp),%r15 - - movq -40(%r11),%r14 - - movq -32(%r11),%r13 - - movq -24(%r11),%r12 - - movq -16(%r11),%rbx - - movq -8(%r11),%rbp + movaps -128(%rbp),%xmm6 + movaps -112(%rbp),%xmm7 + movaps -96(%rbp),%xmm8 + movaps -80(%rbp),%xmm9 + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp + + popq %rbp .LSEH_epilogue_blst_sha256_block_data_order: - mov 8(%r11),%rdi - mov 16(%r11),%rsi + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi - leaq (%r11),%rsp .byte 0xf3,0xc3 .LSEH_end_blst_sha256_block_data_order: @@ -1506,13 +1503,14 @@ blst_sha256_hcopy: .section .xdata .p2align 3 .LSEH_info_blst_sha256_block_data_order_shaext_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0x03 -.byte 0,0 +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 .LSEH_info_blst_sha256_block_data_order_shaext_body: -.byte 1,0,15,0 +.byte 1,0,17,85 .byte 0x00,0x68,0x00,0x00 .byte 0x00,0x78,0x01,0x00 .byte 0x00,0x88,0x02,0x00 @@ -1520,41 +1518,45 @@ blst_sha256_hcopy: .byte 0x00,0xa8,0x04,0x00 .byte 0x00,0x74,0x0c,0x00 .byte 0x00,0x64,0x0d,0x00 -.byte 0x00,0xa2 +.byte 0x00,0x53 +.byte 0x00,0x92 +.byte 0x00,0x50 .byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_shaext_epilogue: -.byte 1,0,5,11 +.byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x03 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0x03 -.byte 0,0 +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 .LSEH_info_blst_sha256_block_data_order_body: -.byte 1,0,26,5 -.byte 0x00,0x68,0x02,0x00 -.byte 0x00,0x78,0x03,0x00 -.byte 0x00,0x88,0x04,0x00 -.byte 0x00,0x98,0x05,0x00 -.byte 0x00,0xf4,0x0d,0x00 -.byte 0x00,0xe4,0x0e,0x00 -.byte 0x00,0xd4,0x0f,0x00 -.byte 0x00,0xc4,0x10,0x00 -.byte 0x00,0x34,0x11,0x00 -.byte 0x00,0x74,0x14,0x00 -.byte 0x00,0x64,0x15,0x00 -.byte 0x00,0x03 -.byte 0x00,0x01,0x12,0x00 +.byte 1,0,25,133 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xf4,0x0b,0x00 +.byte 0x00,0xe4,0x0c,0x00 +.byte 0x00,0xd4,0x0d,0x00 +.byte 0x00,0xc4,0x0e,0x00 +.byte 0x00,0x34,0x0f,0x00 +.byte 0x00,0x74,0x12,0x00 +.byte 0x00,0x64,0x13,0x00 +.byte 0x00,0x53 +.byte 0x00,0xf2 .byte 0x00,0x50 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 .LSEH_info_blst_sha256_block_data_order_epilogue: -.byte 1,0,5,11 +.byte 1,0,4,0 .byte 0x00,0x74,0x01,0x00 .byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x03 -.byte 0x00,0x00 +.byte 0x00,0x00,0x00,0x00 diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S index 347eb315f40..0c5ac5b882d 100644 --- a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 +.hidden 
ct_inverse_mod_256 .type ct_inverse_mod_256, %function .align 5 ct_inverse_mod_256: @@ -60,14 +61,14 @@ ct_inverse_mod_256: madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s index c4d8d6d3700..0f0ca4923d7 100644 --- a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_256 +.hidden ct_inverse_mod_256 .type ct_inverse_mod_256,@function .align 32 ct_inverse_mod_256: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S index d7eca17073c..99bb9def767 100644 --- a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S +++ b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 .type ct_inverse_mod_383, %function .align 5 ct_inverse_mod_383: @@ -71,7 +72,7 @@ ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -82,7 +83,7 @@ ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S index 3f1390ed9dc..07dd99a8af3 100644 --- a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 .type ct_is_square_mod_384, %function .align 5 ct_is_square_mod_384: diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s index fec1493cb12..bf610fa7440 100644 --- a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 .type ct_is_square_mod_384,@function .align 32 ct_is_square_mod_384: diff --git a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s index b702262f6e5..9cca518721f 100644 --- a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s @@ -1,6 +1,8 @@ +.comm __blst_platform_cap,4 .text .globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 .type ct_inverse_mod_383,@function .align 32 ct_inverse_mod_383: @@ -8,6 +10,10 @@ ct_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s index 
25a5fa5345f..9f4d12babd4 100644 --- a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl ctx_inverse_mod_383 +.hidden ctx_inverse_mod_383 .type ctx_inverse_mod_383,@function .align 32 ctx_inverse_mod_383: @@ -8,6 +9,7 @@ ctx_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa +ct_inverse_mod_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -810,7 +812,7 @@ ctx_inverse_mod_383: movq 48(%rsi),%r10 - call __inner_loop_62 + call __tail_loop_53 @@ -1521,9 +1523,9 @@ __inner_loop_31: .cfi_endproc .size __inner_loop_31,.-__inner_loop_31 -.type __inner_loop_62,@function +.type __tail_loop_53,@function .align 32 -__inner_loop_62: +__tail_loop_53: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1532,7 +1534,7 @@ __inner_loop_62: xorq %r12,%r12 movq $1,%r13 -.Loop_62: +.Loop_53: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx @@ -1559,11 +1561,11 @@ __inner_loop_62: subq %rax,%rdx subq %rbx,%rcx subl $1,%edi - jnz .Loop_62 + jnz .Loop_53 .byte 0xf3,0xc3 .cfi_endproc -.size __inner_loop_62,.-__inner_loop_62 +.size __tail_loop_53,.-__tail_loop_53 .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note diff --git a/crypto/blst_src/build/elf/div3w-armv8.S b/crypto/blst_src/build/elf/div3w-armv8.S index a2b1d676a36..37621bee415 100644 --- a/crypto/blst_src/build/elf/div3w-armv8.S +++ b/crypto/blst_src/build/elf/div3w-armv8.S @@ -25,7 +25,7 @@ div_3_limbs: asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/elf/div3w-x86_64.s b/crypto/blst_src/build/elf/div3w-x86_64.s index 00ae5699824..5d9fd8a9139 100644 --- a/crypto/blst_src/build/elf/div3w-x86_64.s +++ b/crypto/blst_src/build/elf/div3w-x86_64.s @@ -8,6 +8,8 @@ div_3_limbs: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 movq 8(%rdi),%r9 xorq %rax,%rax @@ -39,8 +41,9 @@ div_3_limbs: orq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .size div_3_limbs,.-div_3_limbs .globl quot_rem_128 .hidden quot_rem_128 @@ -50,6 +53,8 @@ quot_rem_128: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax movq %rdx,%rcx @@ -84,8 +89,9 @@ quot_rem_128: movq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .size quot_rem_128,.-quot_rem_128 @@ -100,6 +106,8 @@ quot_rem_64: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax imulq 0(%rsi),%rdx @@ -110,8 +118,9 @@ quot_rem_64: movq %r10,0(%rdi) movq %rax,8(%rdi) + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .size quot_rem_64,.-quot_rem_64 .section .note.GNU-stack,"",@progbits diff --git a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s index 37abd4392d3..10b1b56cb50 100644 --- a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s +++ b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl mul_mont_sparse_256 @@ -9,6 +10,10 @@ mul_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -73,6 +78,10 @@ sqr_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -419,6 +428,10 @@ from_mont_256: 
.byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -494,6 +507,10 @@ redc_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s index fa9dd3529ad..903ba23b12c 100644 --- a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s +++ b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text @@ -6,9 +7,9 @@ -.type __sub_mod_384x384,@function +.type __subq_mod_384x384,@function .align 32 -__sub_mod_384x384: +__subq_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -73,11 +74,11 @@ __sub_mod_384x384: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384x384,.-__sub_mod_384x384 +.size __subq_mod_384x384,.-__subq_mod_384x384 -.type __add_mod_384,@function +.type __addq_mod_384,@function .align 32 -__add_mod_384: +__addq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -125,11 +126,11 @@ __add_mod_384: .byte 0xf3,0xc3 .cfi_endproc -.size __add_mod_384,.-__add_mod_384 +.size __addq_mod_384,.-__addq_mod_384 -.type __sub_mod_384,@function +.type __subq_mod_384,@function .align 32 -__sub_mod_384: +__subq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +141,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -177,7 +178,7 @@ __sub_mod_384_a_is_loaded: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384,.-__sub_mod_384 +.size __subq_mod_384,.-__subq_mod_384 .globl mul_mont_384x .hidden mul_mont_384x .type mul_mont_384x,@function @@ -187,6 +188,10 @@ mul_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -232,12 +237,12 @@ mul_mont_384x: movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -247,17 +252,17 @@ mul_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 movq %rcx,%rbx @@ -266,14 +271,14 @@ mul_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -303,6 +308,10 @@ sqr_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -333,13 +342,13 @@ sqr_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subq_mod_384 movq 16(%rsp),%rsi @@ -427,6 +436,10 @@ mul_382x: .byte 
0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -521,18 +534,18 @@ mul_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -562,6 +575,10 @@ sqr_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -617,7 +634,7 @@ sqr_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -695,6 +712,10 @@ mul_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1019,6 +1040,10 @@ sqr_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1265,6 +1290,10 @@ sqr_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1299,7 +1328,7 @@ sqr_mont_384: movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 @@ -1332,6 +1361,10 @@ redc_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1356,7 +1389,7 @@ redc_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 @@ -1389,6 +1422,10 @@ from_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1762,9 +1799,9 @@ __mulq_by_1_mont_384: .cfi_endproc .size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 -.type __redc_tail_mont_384,@function +.type __redq_tail_mont_384,@function .align 32 -__redc_tail_mont_384: +__redq_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1809,7 +1846,7 @@ __redc_tail_mont_384: .byte 0xf3,0xc3 .cfi_endproc -.size __redc_tail_mont_384,.-__redc_tail_mont_384 +.size __redq_tail_mont_384,.-__redq_tail_mont_384 .globl sgn0_pty_mont_384 .hidden sgn0_pty_mont_384 @@ -1820,6 +1857,10 @@ sgn0_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1898,6 +1939,10 @@ sgn0_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2025,6 +2070,10 @@ mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2689,6 +2738,10 @@ sqr_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef 
__BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2727,7 +2780,7 @@ sqr_n_mul_mont_384: movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi @@ -2777,6 +2830,10 @@ sqr_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3438,6 +3495,10 @@ sqr_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s index 20a02073246..42e89134cff 100644 --- a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s +++ b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s @@ -9,6 +9,7 @@ mulx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -71,6 +72,7 @@ sqrx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -332,6 +334,7 @@ fromx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -407,6 +410,7 @@ redcx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s index 9f9f7404ee4..5c67d918d22 100644 --- a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s +++ b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s @@ -6,9 +6,9 @@ -.type __sub_mod_384x384,@function +.type __subx_mod_384x384,@function .align 32 -__sub_mod_384x384: +__subx_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -73,11 +73,11 @@ __sub_mod_384x384: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384x384,.-__sub_mod_384x384 +.size __subx_mod_384x384,.-__subx_mod_384x384 -.type __add_mod_384,@function +.type __addx_mod_384,@function .align 32 -__add_mod_384: +__addx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -125,11 +125,11 @@ __add_mod_384: .byte 0xf3,0xc3 .cfi_endproc -.size __add_mod_384,.-__add_mod_384 +.size __addx_mod_384,.-__addx_mod_384 -.type __sub_mod_384,@function +.type __subx_mod_384,@function .align 32 -__sub_mod_384: +__subx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +140,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -177,7 +177,7 @@ __sub_mod_384_a_is_loaded: .byte 0xf3,0xc3 .cfi_endproc -.size __sub_mod_384,.-__sub_mod_384 +.size __subx_mod_384,.-__subx_mod_384 .globl mulx_mont_384x .hidden mulx_mont_384x .type mulx_mont_384x,@function @@ -187,6 +187,7 @@ mulx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -233,12 +234,12 @@ mulx_mont_384x: leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -248,17 +249,17 @@ mulx_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 
8(%rsp),%rcx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rcx),%rbx @@ -267,14 +268,14 @@ mulx_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -304,6 +305,7 @@ sqrx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -335,13 +337,13 @@ sqrx_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subx_mod_384 movq 24(%rsp),%rsi @@ -439,6 +441,7 @@ mulx_382x: .byte 0xf3,0x0f,0x1e,0xfa +mul_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -533,18 +536,18 @@ mulx_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -574,6 +577,7 @@ sqrx_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -629,7 +633,7 @@ sqrx_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -707,6 +711,7 @@ mulx_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -933,6 +938,7 @@ sqrx_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1127,6 +1133,7 @@ redcx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1151,7 +1158,7 @@ redcx_mont_384: movq %rdx,%rbx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 @@ -1184,6 +1191,7 @@ fromx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1448,9 +1456,9 @@ __mulx_by_1_mont_384: .cfi_endproc .size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 -.type __redc_tail_mont_384,@function +.type __redx_tail_mont_384,@function .align 32 -__redc_tail_mont_384: +__redx_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1495,7 +1503,7 @@ __redc_tail_mont_384: .byte 0xf3,0xc3 .cfi_endproc -.size __redc_tail_mont_384,.-__redc_tail_mont_384 +.size __redx_tail_mont_384,.-__redx_tail_mont_384 .globl sgn0x_pty_mont_384 .hidden sgn0x_pty_mont_384 @@ -1506,6 +1514,7 @@ sgn0x_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1584,6 +1593,7 @@ sgn0x_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1711,6 +1721,7 @@ mulx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2178,6 +2189,7 @@ sqrx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ 
-2245,6 +2257,7 @@ sqrx_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2330,6 +2343,7 @@ sqrx_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2776,6 +2790,7 @@ sqrx_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/elf/sha256-armv8.S b/crypto/blst_src/build/elf/sha256-armv8.S index 7341decf4f5..45c1162c467 100644 --- a/crypto/blst_src/build/elf/sha256-armv8.S +++ b/crypto/blst_src/build/elf/sha256-armv8.S @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. +.comm __blst_platform_cap,4 .text .align 6 @@ -184,6 +185,11 @@ blst_sha256_block_armv8: .type blst_sha256_block_data_order,%function .align 4 blst_sha256_block_data_order: + adrp x16,__blst_platform_cap + ldr w16,[x16,#:lo12:__blst_platform_cap] + tst w16,#1 + b.ne .Lv8_entry + stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/elf/sha256-portable-x86_64.s b/crypto/blst_src/build/elf/sha256-portable-x86_64.s index 20b5c411306..2fd6a770917 100644 --- a/crypto/blst_src/build/elf/sha256-portable-x86_64.s +++ b/crypto/blst_src/build/elf/sha256-portable-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .globl blst_sha256_block_data_order @@ -8,33 +9,35 @@ blst_sha256_block_data_order: .byte 0xf3,0x0f,0x1e,0xfa - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifdef __BLST_PORTABLE__ + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 +#endif + pushq %rbx +.cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+24,%rsp -.cfi_adjust_cfa_offset 16*4+3*8 + +.cfi_def_cfa %rsp,144 + leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx @@ -1636,23 +1639,23 @@ blst_sha256_block_data_order: leaq 64+24+48(%rsp),%r11 .cfi_def_cfa %r11,8 movq 64+24(%rsp),%r15 -.cfi_restore %r15 movq -40(%r11),%r14 -.cfi_restore %r14 movq -32(%r11),%r13 -.cfi_restore %r13 movq -24(%r11),%r12 + movq -16(%r11),%rbx + movq -8(%r11),%rbp .cfi_restore %r12 - movq -16(%r11),%rbp +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbp - movq -8(%r11),%rbx .cfi_restore %rbx - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_block_data_order,.-blst_sha256_block_data_order +#ifndef __BLST_PORTABLE__ .align 64 .type K256,@object K256: @@ -1744,6 +1747,7 @@ blst_sha256_hcopy: .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_hcopy,.-blst_sha256_hcopy +#endif .section .note.GNU-stack,"",@progbits .section .note.gnu.property,"a",@note diff --git a/crypto/blst_src/build/elf/sha256-x86_64.s b/crypto/blst_src/build/elf/sha256-x86_64.s index 47fdc5bc57a..940051aab16 100644 --- a/crypto/blst_src/build/elf/sha256-x86_64.s +++ 
b/crypto/blst_src/build/elf/sha256-x86_64.s @@ -1,3 +1,4 @@ +.comm __blst_platform_cap,4 .text .align 64 @@ -33,6 +34,13 @@ blst_sha256_block_data_order_shaext: .byte 0xf3,0x0f,0x1e,0xfa + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +.Lblst_sha256_block_data_order$2: + leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 @@ -234,6 +242,11 @@ blst_sha256_block_data_order_shaext: movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext @@ -249,30 +262,27 @@ blst_sha256_block_data_order: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 pushq %rbx -.cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx - subq $40,%rsp -.cfi_adjust_cfa_offset 40 + subq $24,%rsp + leaq (%rsi,%rdx,4),%rdx - movq %rdi,0(%rsp) + movq %rdi,-64(%rbp) - movq %rdx,16(%rsp) - movq %rsp,%rbp -.cfi_def_cfa_register %rbp + movq %rdx,-48(%rbp) leaq -64(%rsp),%rsp @@ -291,7 +301,7 @@ blst_sha256_block_data_order: .align 16 .Lloop_ssse3: movdqa K256+256(%rip),%xmm7 - movq %rsi,8(%rbp) + movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1316,9 +1326,9 @@ blst_sha256_block_data_order: addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d - movq 0(%rbp),%rdi + movq -64(%rbp),%rdi movl %r14d,%eax - movq 8(%rbp),%rsi + movq -56(%rbp),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx @@ -1330,7 +1340,7 @@ blst_sha256_block_data_order: addl 28(%rdi),%r11d leaq 64(%rsi),%rsi - cmpq 16(%rbp),%rsi + cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) @@ -1343,26 +1353,25 @@ blst_sha256_block_data_order: jb .Lloop_ssse3 xorps %xmm0,%xmm0 - leaq 40+48(%rbp),%r11 -.cfi_def_cfa %r11,8 movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) - movq 40(%rbp),%r15 -.cfi_restore %r15 - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp .cfi_restore %r12 - movq -16(%r11),%rbx +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbx - movq -8(%r11),%rbp -.cfi_restore %rbp - - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc .size blst_sha256_block_data_order,.-blst_sha256_block_data_order diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S index f3a2c3b5f11..2fd4847a496 100644 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S @@ -1,6 +1,7 @@ .text .globl _ct_inverse_mod_256 +.private_extern _ct_inverse_mod_256 .align 5 _ct_inverse_mod_256: @@ -60,14 +61,14 @@ _ct_inverse_mod_256: madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp 
x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s index b6441da6e1f..bf0ad8986e7 100644 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s @@ -1,6 +1,7 @@ .text .globl _ct_inverse_mod_256 +.private_extern _ct_inverse_mod_256 .p2align 5 _ct_inverse_mod_256: diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S index c7d9ba8488e..b9c3acde200 100644 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl _ct_inverse_mod_383 +.private_extern _ct_inverse_mod_383 .align 5 _ct_inverse_mod_383: @@ -71,7 +72,7 @@ _ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -82,7 +83,7 @@ _ct_inverse_mod_383: adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S index b5c953d287a..9fe0df88b59 100644 --- a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S @@ -1,6 +1,7 @@ .text .globl _ct_is_square_mod_384 +.private_extern _ct_is_square_mod_384 .align 5 _ct_is_square_mod_384: diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s index f2823941167..5faadb8dbff 100644 --- a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl _ct_is_square_mod_384 +.private_extern _ct_is_square_mod_384 .p2align 5 _ct_is_square_mod_384: diff --git a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s index 185a876b87c..eebe131d0cb 100644 --- a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s @@ -1,6 +1,8 @@ +.comm ___blst_platform_cap,4 .text .globl _ct_inverse_mod_383 +.private_extern _ct_inverse_mod_383 .p2align 5 _ct_inverse_mod_383: @@ -8,6 +10,10 @@ _ct_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s index 3e05df3a4b3..3f999075813 100644 --- a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s @@ -1,6 +1,7 @@ .text .globl _ctx_inverse_mod_383 +.private_extern _ctx_inverse_mod_383 .p2align 5 _ctx_inverse_mod_383: @@ -8,6 +9,7 @@ _ctx_inverse_mod_383: .byte 0xf3,0x0f,0x1e,0xfa 
+ct_inverse_mod_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -810,7 +812,7 @@ _ctx_inverse_mod_383: movq 48(%rsi),%r10 - call __inner_loop_62 + call __tail_loop_53 @@ -1523,7 +1525,7 @@ L$oop_31: .p2align 5 -__inner_loop_62: +__tail_loop_53: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1532,7 +1534,7 @@ __inner_loop_62: xorq %r12,%r12 movq $1,%r13 -L$oop_62: +L$oop_53: xorq %rax,%rax testq $1,%r8 movq %r10,%rbx @@ -1559,7 +1561,7 @@ L$oop_62: subq %rax,%rdx subq %rbx,%rcx subl $1,%edi - jnz L$oop_62 + jnz L$oop_53 .byte 0xf3,0xc3 .cfi_endproc diff --git a/crypto/blst_src/build/mach-o/div3w-armv8.S b/crypto/blst_src/build/mach-o/div3w-armv8.S index 5a5eb3a01d7..4b130080123 100644 --- a/crypto/blst_src/build/mach-o/div3w-armv8.S +++ b/crypto/blst_src/build/mach-o/div3w-armv8.S @@ -25,7 +25,7 @@ Loop: asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/mach-o/div3w-x86_64.s b/crypto/blst_src/build/mach-o/div3w-x86_64.s index 8075571c87d..99a94d50a2b 100644 --- a/crypto/blst_src/build/mach-o/div3w-x86_64.s +++ b/crypto/blst_src/build/mach-o/div3w-x86_64.s @@ -8,6 +8,8 @@ _div_3_limbs: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 movq 8(%rdi),%r9 xorq %rax,%rax @@ -39,8 +41,9 @@ L$oop: orq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc .globl _quot_rem_128 .private_extern _quot_rem_128 @@ -50,6 +53,8 @@ _quot_rem_128: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax movq %rdx,%rcx @@ -84,8 +89,9 @@ _quot_rem_128: movq %rcx,%rax + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc @@ -100,6 +106,8 @@ _quot_rem_64: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax imulq 0(%rsi),%rdx @@ -110,6 +118,7 @@ _quot_rem_64: movq %r10,0(%rdi) movq %rax,8(%rdi) + .byte 0xf3,0xc3 -.cfi_endproc +.cfi_endproc diff --git a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s index d83f5440342..842c39225b6 100644 --- a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text .globl _mul_mont_sparse_256 @@ -9,6 +10,10 @@ _mul_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -73,6 +78,10 @@ _sqr_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -419,6 +428,10 @@ _from_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -494,6 +507,10 @@ _redc_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s index 0d8ac89cfc2..7052343d0ac 100644 --- a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text @@ -8,7 +9,7 @@ .p2align 5 
-__sub_mod_384x384: +__subq_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -77,7 +78,7 @@ __sub_mod_384x384: .p2align 5 -__add_mod_384: +__addq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -129,7 +130,7 @@ __add_mod_384: .p2align 5 -__sub_mod_384: +__subq_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +141,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subq_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -187,6 +188,10 @@ _mul_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -232,12 +237,12 @@ _mul_mont_384x: movq 8(%rsp),%rcx leaq -48(%rsi),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addq_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -247,17 +252,17 @@ _mul_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 movq %rcx,%rbx @@ -266,14 +271,14 @@ _mul_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -303,6 +308,10 @@ _sqr_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -333,13 +342,13 @@ _sqr_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addq_mod_384 movq 16(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subq_mod_384 movq 16(%rsp),%rsi @@ -427,6 +436,10 @@ _mul_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -521,18 +534,18 @@ _mul_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subq_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subq_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -562,6 +575,10 @@ _sqr_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -617,7 +634,7 @@ _sqr_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -695,6 +712,10 @@ _mul_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1019,6 +1040,10 @@ _sqr_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1265,6 +1290,10 @@ _sqr_mont_384: .byte 0xf3,0x0f,0x1e,0xfa 
+#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1299,7 +1328,7 @@ _sqr_mont_384: movq 104(%rsp),%rbx movq 112(%rsp),%rdi call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 leaq 120(%rsp),%r8 movq 120(%rsp),%r15 @@ -1332,6 +1361,10 @@ _redc_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1356,7 +1389,7 @@ _redc_mont_384: movq %rdx,%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movq 8(%rsp),%r15 .cfi_restore %r15 @@ -1389,6 +1422,10 @@ _from_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1764,7 +1801,7 @@ __mulq_by_1_mont_384: .p2align 5 -__redc_tail_mont_384: +__redq_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1820,6 +1857,10 @@ _sgn0_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1898,6 +1939,10 @@ _sgn0_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2025,6 +2070,10 @@ _mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2689,6 +2738,10 @@ _sqr_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2727,7 +2780,7 @@ L$oop_sqr_384: movq 0(%rsp),%rcx movq 16(%rsp),%rbx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd %xmm1,%edx leaq 0(%rdi),%rsi @@ -2777,6 +2830,10 @@ _sqr_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -3438,6 +3495,10 @@ _sqr_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s index 178372f41b2..ae9a76b739c 100644 --- a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s @@ -9,6 +9,7 @@ _mulx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -71,6 +72,7 @@ _sqrx_mont_sparse_256: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_sparse_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -332,6 +334,7 @@ _fromx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -407,6 +410,7 @@ _redcx_mont_256: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_256$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s 
b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s index 95d3dadcc67..c5afeec8a51 100644 --- a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s +++ b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s @@ -8,7 +8,7 @@ .p2align 5 -__sub_mod_384x384: +__subx_mod_384x384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -77,7 +77,7 @@ __sub_mod_384x384: .p2align 5 -__add_mod_384: +__addx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -129,7 +129,7 @@ __add_mod_384: .p2align 5 -__sub_mod_384: +__subx_mod_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -140,7 +140,7 @@ __sub_mod_384: movq 32(%rsi),%r12 movq 40(%rsi),%r13 -__sub_mod_384_a_is_loaded: +__subx_mod_384_a_is_loaded: subq 0(%rdx),%r8 movq 0(%rcx),%r14 sbbq 8(%rdx),%r9 @@ -187,6 +187,7 @@ _mulx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -233,12 +234,12 @@ _mulx_mont_384x: leaq (%rbx),%rsi leaq -48(%rbx),%rdx leaq 40+192+48(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq -48(%rdi),%rdi - call __add_mod_384 + call __addx_mod_384 leaq (%rdi),%rbx leaq 48(%rdi),%rsi @@ -248,17 +249,17 @@ _mulx_mont_384x: leaq (%rdi),%rsi leaq 40(%rsp),%rdx movq 8(%rsp),%rcx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 40(%rsp),%rsi leaq 40+96(%rsp),%rdx leaq 40(%rsp),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq (%rcx),%rbx @@ -267,14 +268,14 @@ _mulx_mont_384x: movq 0(%rsp),%rcx movq 32(%rsp),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 40+192(%rsp),%rsi movq 0(%rsp),%rcx leaq 48(%rdi),%rdi call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 leaq 328(%rsp),%r8 movq 0(%r8),%r15 @@ -304,6 +305,7 @@ _sqrx_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -335,13 +337,13 @@ _sqrx_mont_384x: leaq 48(%rsi),%rdx leaq 32(%rsp),%rdi - call __add_mod_384 + call __addx_mod_384 movq 24(%rsp),%rsi leaq 48(%rsi),%rdx leaq 32+48(%rsp),%rdi - call __sub_mod_384 + call __subx_mod_384 movq 24(%rsp),%rsi @@ -439,6 +441,7 @@ _mulx_382x: .byte 0xf3,0x0f,0x1e,0xfa +mul_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -533,18 +536,18 @@ _mulx_382x: leaq 32(%rsp),%rdx movq 24(%rsp),%rcx movq %rsi,%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 0(%rdi),%rsi leaq -96(%rdi),%rdx - call __sub_mod_384x384 + call __subx_mod_384x384 leaq -96(%rdi),%rsi leaq 32(%rsp),%rdx leaq -96(%rdi),%rdi - call __sub_mod_384x384 + call __subx_mod_384x384 leaq 136(%rsp),%r8 movq 0(%r8),%r15 @@ -574,6 +577,7 @@ _sqrx_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -629,7 +633,7 @@ _sqrx_382x: leaq 48(%rsi),%rdx leaq 48(%rdi),%rdi - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded leaq (%rdi),%rsi @@ -707,6 +711,7 @@ _mulx_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -933,6 +938,7 @@ _sqrx_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1127,6 +1133,7 @@ _redcx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +redc_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1151,7 +1158,7 @@ _redcx_mont_384: movq %rdx,%rbx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 movq 
8(%rsp),%r15 .cfi_restore %r15 @@ -1184,6 +1191,7 @@ _fromx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +from_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1450,7 +1458,7 @@ __mulx_by_1_mont_384: .p2align 5 -__redc_tail_mont_384: +__redx_tail_mont_384: .cfi_startproc .byte 0xf3,0x0f,0x1e,0xfa @@ -1506,6 +1514,7 @@ _sgn0x_pty_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1584,6 +1593,7 @@ _sgn0x_pty_mont_384x: .byte 0xf3,0x0f,0x1e,0xfa +sgn0_pty_mont_384x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -1711,6 +1721,7 @@ _mulx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2178,6 +2189,7 @@ _sqrx_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2245,6 +2257,7 @@ _sqrx_n_mul_mont_384: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_384$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2330,6 +2343,7 @@ _sqrx_n_mul_mont_383: .byte 0xf3,0x0f,0x1e,0xfa +sqr_n_mul_mont_383$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 @@ -2776,6 +2790,7 @@ _sqrx_mont_382x: .byte 0xf3,0x0f,0x1e,0xfa +sqr_mont_382x$1: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 diff --git a/crypto/blst_src/build/mach-o/sha256-armv8.S b/crypto/blst_src/build/mach-o/sha256-armv8.S index c928f75025f..3f3c1266dcd 100644 --- a/crypto/blst_src/build/mach-o/sha256-armv8.S +++ b/crypto/blst_src/build/mach-o/sha256-armv8.S @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. +.comm ___blst_platform_cap,4 .text .align 6 @@ -184,6 +185,11 @@ Loop_hw: .align 4 _blst_sha256_block_data_order: + adrp x16,___blst_platform_cap@PAGE + ldr w16,[x16,___blst_platform_cap@PAGEOFF] + tst w16,#1 + b.ne Lv8_entry + stp x29, x30, [sp, #-16]! 
mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s index 3f000720d00..9f0a4f84ff0 100644 --- a/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s +++ b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text .globl _blst_sha256_block_data_order @@ -8,33 +9,35 @@ _blst_sha256_block_data_order: .byte 0xf3,0x0f,0x1e,0xfa - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-24 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifdef __BLST_PORTABLE__ + testl $2,___blst_platform_cap(%rip) + jnz L$blst_sha256_block_data_order$2 +#endif + pushq %rbx +.cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+24,%rsp -.cfi_adjust_cfa_offset 16*4+3*8 + +.cfi_def_cfa %rsp,144 + leaq (%rsi,%rdx,4),%rdx movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx @@ -1636,23 +1639,23 @@ L$rounds_16_xx: leaq 64+24+48(%rsp),%r11 .cfi_def_cfa %r11,8 movq 64+24(%rsp),%r15 -.cfi_restore %r15 movq -40(%r11),%r14 -.cfi_restore %r14 movq -32(%r11),%r13 -.cfi_restore %r13 movq -24(%r11),%r12 + movq -16(%r11),%rbx + movq -8(%r11),%rbp .cfi_restore %r12 - movq -16(%r11),%rbp +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbp - movq -8(%r11),%rbx .cfi_restore %rbx - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc +#ifndef __BLST_PORTABLE__ .p2align 6 K256: @@ -1744,3 +1747,4 @@ _blst_sha256_hcopy: .byte 0xf3,0xc3 .cfi_endproc +#endif diff --git a/crypto/blst_src/build/mach-o/sha256-x86_64.s b/crypto/blst_src/build/mach-o/sha256-x86_64.s index dee75e35362..cff024eed4f 100644 --- a/crypto/blst_src/build/mach-o/sha256-x86_64.s +++ b/crypto/blst_src/build/mach-o/sha256-x86_64.s @@ -1,3 +1,4 @@ +.comm ___blst_platform_cap,4 .text .p2align 6 @@ -33,6 +34,13 @@ _blst_sha256_block_data_order_shaext: .byte 0xf3,0x0f,0x1e,0xfa + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +L$blst_sha256_block_data_order$2: + leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 @@ -234,6 +242,11 @@ L$oop_shaext: movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + .byte 0xf3,0xc3 .cfi_endproc @@ -249,30 +262,27 @@ _blst_sha256_block_data_order: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + testl $2,___blst_platform_cap(%rip) + jnz L$blst_sha256_block_data_order$2 pushq %rbx -.cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 -.cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 -.cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 -.cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 -.cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 shlq $4,%rdx - subq $40,%rsp -.cfi_adjust_cfa_offset 40 + subq $24,%rsp + leaq (%rsi,%rdx,4),%rdx - movq %rdi,0(%rsp) + movq %rdi,-64(%rbp) - movq %rdx,16(%rsp) - movq %rsp,%rbp -.cfi_def_cfa_register %rbp + movq %rdx,-48(%rbp) leaq -64(%rsp),%rsp @@ -291,7 +301,7 @@ _blst_sha256_block_data_order: .p2align 4 L$loop_ssse3: movdqa K256+256(%rip),%xmm7 - movq %rsi,8(%rbp) + 
movq %rsi,-56(%rbp) movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 @@ -1316,9 +1326,9 @@ L$ssse3_00_47: addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d - movq 0(%rbp),%rdi + movq -64(%rbp),%rdi movl %r14d,%eax - movq 8(%rbp),%rsi + movq -56(%rbp),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx @@ -1330,7 +1340,7 @@ L$ssse3_00_47: addl 28(%rdi),%r11d leaq 64(%rsi),%rsi - cmpq 16(%rbp),%rsi + cmpq -48(%rbp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) @@ -1343,26 +1353,25 @@ L$ssse3_00_47: jb L$loop_ssse3 xorps %xmm0,%xmm0 - leaq 40+48(%rbp),%r11 -.cfi_def_cfa %r11,8 movaps %xmm0,0(%rsp) movaps %xmm0,16(%rsp) movaps %xmm0,32(%rsp) movaps %xmm0,48(%rsp) - movq 40(%rbp),%r15 -.cfi_restore %r15 - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp .cfi_restore %r12 - movq -16(%r11),%rbx +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 .cfi_restore %rbx - movq -8(%r11),%rbp -.cfi_restore %rbp - - leaq (%r11),%rsp .byte 0xf3,0xc3 .cfi_endproc diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh new file mode 100755 index 00000000000..56b0b279c69 --- /dev/null +++ b/crypto/blst_src/build/refresh.sh @@ -0,0 +1,48 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" + +PERL=${PERL:-perl} + +for pl in ../src/asm/*-x86_64.pl; do + s=`basename $pl .pl`.asm + expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) + s=`basename $pl .pl`.s + (set -x; ${PERL} $pl elf > elf/$s) + (set -x; ${PERL} $pl mingw64 > coff/$s) + (set -x; ${PERL} $pl macosx > mach-o/$s) +done + +for pl in ../src/asm/*-armv8.pl; do + s=`basename $pl .pl`.asm + (set -x; ${PERL} $pl win64 > win64/$s) + s=`basename $pl .pl`.S + (set -x; ${PERL} $pl linux64 > elf/$s) + (set -x; ${PERL} $pl coff64 > coff/$s) + (set -x; ${PERL} $pl ios64 > mach-o/$s) +done + +( cd ../bindings; + echo "LIBRARY blst" + echo + echo "EXPORTS" + cc -E blst.h | \ + ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' + echo +) > win64/blst.def + +if which bindgen > /dev/null 2>&1; then + ( cd ../bindings; set -x; + bindgen --opaque-type blst_pairing \ + --opaque-type blst_uniq \ + --with-derive-default \ + --with-derive-eq \ + --rustified-enum BLST.\* \ + blst.h -- -D__BLST_RUST_BINDGEN__ \ + | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs + ) +else + echo "Install Rust bindgen with 'cargo install bindgen-cli'" 1>&2 + exit 1 +fi diff --git a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm index 09a5c17975d..d5308b8f809 100644 --- a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm +++ b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm @@ -11,15 +11,14 @@ add_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -84,14 +83,13 @@ mul_by_3_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -130,6 +128,7 @@ mul_by_3_mod_256 ENDP ALIGN 32 __lshift_mod_256 PROC PRIVATE DB 243,15,30,250 + add r8,r8 adc r9,r9 mov rax,r8 @@ -165,15 +164,14 @@ 
lshift_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_lshift_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -224,15 +222,14 @@ rshift_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_rshift_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -315,15 +312,14 @@ cneg_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_cneg_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -395,15 +391,14 @@ sub_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -466,11 +461,10 @@ check_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_check_mod_256:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx mov rax,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] mov r10,QWORD PTR[16+rdi] @@ -511,15 +505,14 @@ add_n_check_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_n_check_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -589,15 +582,14 @@ sub_n_check_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_n_check_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx sub rsp,8 @@ -764,8 +756,9 @@ $L$SEH_info_add_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -773,7 +766,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -784,8 +778,9 @@ $L$SEH_info_mul_by_3_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_3_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -805,8 +800,9 @@ $L$SEH_info_lshift_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_lshift_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -826,8 +822,9 @@ $L$SEH_info_rshift_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_rshift_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -835,7 +832,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -846,8 +844,9 @@ $L$SEH_info_cneg_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_cneg_mod_256_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -867,8 +866,9 @@ $L$SEH_info_sub_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -876,7 +876,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 
000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -893,8 +894,9 @@ $L$SEH_info_add_n_check_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_n_check_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -902,7 +904,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_n_check_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -913,8 +916,9 @@ $L$SEH_info_sub_n_check_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_n_check_mod_256_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -922,7 +926,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_n_check_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm index 8a7b9e255db..560e02ee105 100644 --- a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm @@ -11,15 +11,14 @@ add_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -64,6 +63,7 @@ add_mod_384 ENDP ALIGN 32 __add_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -120,15 +120,14 @@ add_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -190,15 +189,14 @@ rshift_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_rshift_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -260,6 +258,7 @@ rshift_mod_384 ENDP ALIGN 32 __rshift_mod_384 PROC PRIVATE DB 243,15,30,250 + mov rsi,1 mov r14,QWORD PTR[rcx] and rsi,r8 @@ -320,14 +319,13 @@ div_by_2_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_div_by_2_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -394,15 +392,14 @@ lshift_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_lshift_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -493,6 +490,7 @@ lshift_mod_384 ENDP ALIGN 32 __lshift_mod_384 PROC PRIVATE DB 243,15,30,250 + add r8,r8 adc r9,r9 adc r10,r10 @@ -536,14 +534,13 @@ mul_by_3_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -605,14 +602,13 @@ mul_by_8_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_8_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -681,14 +677,13 @@ mul_by_3_mod_384x 
PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_3_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -766,14 +761,13 @@ mul_by_8_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_8_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -861,15 +855,14 @@ cneg_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_cneg_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -970,15 +963,14 @@ sub_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -1023,6 +1015,7 @@ sub_mod_384 ENDP ALIGN 32 __sub_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -1077,15 +1070,14 @@ sub_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -1145,14 +1137,13 @@ mul_by_1_plus_i_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_by_1_plus_i_mod_384x:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 push rbx push r12 @@ -1297,11 +1288,10 @@ sgn0_pty_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mod_384:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx $L$SEH_body_sgn0_pty_mod_384:: mov r8,QWORD PTR[rdi] @@ -1353,13 +1343,12 @@ sgn0_pty_mod_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mod_384x:: - mov rdi,rcx - mov rsi,rdx - push rbp + mov rdi,rcx + mov rsi,rdx push rbx sub rsp,8 @@ -1472,6 +1461,7 @@ PUBLIC vec_select_32 ALIGN 32 vec_select_32 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1500,6 +1490,7 @@ PUBLIC vec_select_48 ALIGN 32 vec_select_48 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1534,6 +1525,7 @@ PUBLIC vec_select_96 ALIGN 32 vec_select_96 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1586,6 +1578,7 @@ PUBLIC vec_select_192 ALIGN 32 vec_select_192 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1674,6 +1667,7 @@ PUBLIC vec_select_144 ALIGN 32 vec_select_144 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1744,6 +1738,7 @@ PUBLIC vec_select_288 ALIGN 32 vec_select_288 PROC PUBLIC DB 243,15,30,250 + movd xmm5,r9d pxor xmm4,xmm4 pshufd xmm5,xmm5,0 @@ -1868,6 +1863,7 @@ PUBLIC vec_prefetch ALIGN 32 vec_prefetch PROC PUBLIC DB 243,15,30,250 + lea rdx,QWORD PTR[((-1))+rdx*1+rcx] mov rax,64 xor r8,r8 @@ -1909,6 +1905,7 @@ PUBLIC vec_is_zero_16x ALIGN 32 vec_is_zero_16x PROC PUBLIC DB 243,15,30,250 + shr edx,4 movdqu xmm0,XMMWORD PTR[rcx] lea rcx,QWORD PTR[16+rcx] @@ -1937,6 +1934,7 @@ PUBLIC vec_is_equal_16x ALIGN 32 vec_is_equal_16x PROC PUBLIC DB 243,15,30,250 + shr r8d,4 movdqu xmm0,XMMWORD PTR[rcx] movdqu xmm1,XMMWORD PTR[rdx] @@ -2154,8 +2152,9 @@ $L$SEH_info_add_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 
$L$SEH_info_add_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2167,7 +2166,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2178,8 +2178,9 @@ $L$SEH_info_add_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -2191,7 +2192,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2202,8 +2204,9 @@ $L$SEH_info_rshift_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_rshift_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2215,7 +2218,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_rshift_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2226,8 +2230,9 @@ $L$SEH_info_div_by_2_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_div_by_2_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2239,7 +2244,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_div_by_2_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2250,8 +2256,9 @@ $L$SEH_info_lshift_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_lshift_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2263,7 +2270,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_lshift_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2274,8 +2282,9 @@ $L$SEH_info_mul_by_3_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_3_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2287,7 +2296,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2298,8 +2308,9 @@ $L$SEH_info_mul_by_8_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_8_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2311,7 +2322,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2322,8 +2334,9 @@ $L$SEH_info_mul_by_3_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_3_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2335,7 +2348,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_3_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2346,8 +2360,9 @@ 
$L$SEH_info_mul_by_8_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_8_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2359,7 +2374,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_8_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2370,8 +2386,9 @@ $L$SEH_info_cneg_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_cneg_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2383,7 +2400,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_cneg_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2394,8 +2412,9 @@ $L$SEH_info_sub_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -2407,7 +2426,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2418,8 +2438,9 @@ $L$SEH_info_sub_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -2431,7 +2452,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2442,8 +2464,9 @@ $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_by_1_plus_i_mod_384x_body:: DB 1,0,17,0 DB 000h,0f4h,007h,000h @@ -2455,7 +2478,8 @@ DB 000h,054h,00ch,000h DB 000h,074h,00eh,000h DB 000h,064h,00fh,000h DB 000h,0c2h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2466,8 +2490,9 @@ $L$SEH_info_sgn0_pty_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mod_384_body:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -2483,8 +2508,9 @@ $L$SEH_info_sgn0_pty_mod_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mod_384x_body:: DB 1,0,9,0 DB 000h,034h,001h,000h @@ -2492,7 +2518,8 @@ DB 000h,054h,002h,000h DB 000h,074h,004h,000h DB 000h,064h,005h,000h DB 000h,022h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mod_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm index 57d1752fd3c..59b51a910ce 100644 --- a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm +++ b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm @@ -5,6 +5,7 @@ OPTION DOTNAME ALIGN 32 __add_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -72,6 +73,7 @@ __add_mod_384x384 ENDP ALIGN 32 __sub_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov 
r10,QWORD PTR[16+rsi] @@ -144,15 +146,14 @@ add_mod_384x384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_add_mod_384x384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -203,15 +204,14 @@ sub_mod_384x384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sub_mod_384x384:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -285,8 +285,9 @@ $L$SEH_info_add_mod_384x384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_add_mod_384x384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -298,7 +299,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_add_mod_384x384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -309,8 +311,9 @@ $L$SEH_info_sub_mod_384x384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sub_mod_384x384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -322,7 +325,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sub_mod_384x384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/blst.def b/crypto/blst_src/build/win64/blst.def index 3fbb6b3a97d..dda95336a93 100644 --- a/crypto/blst_src/build/win64/blst.def +++ b/crypto/blst_src/build/win64/blst.def @@ -152,6 +152,7 @@ EXPORTS blst_sk_to_pk_in_g2 blst_sign_pk_in_g2 blst_miller_loop + blst_miller_loop_n blst_final_exp blst_precompute_lines blst_miller_loop_lines @@ -180,6 +181,8 @@ EXPORTS BLS12_381_NEG_G1 BLS12_381_G2 BLS12_381_NEG_G2 + blst_fr_ct_bfly + blst_fr_gs_bfly blst_fr_to blst_fr_from blst_fp_to @@ -214,4 +217,5 @@ EXPORTS blst_p2_sizeof blst_p2_affine_sizeof blst_fp12_sizeof + blst_sha256 diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm index f3c2f0d05f9..a4467904612 100644 --- a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm @@ -1,6 +1,7 @@ AREA |.text|,CODE,ALIGN=8,ARM64 + EXPORT |ct_inverse_mod_256|[FUNC] ALIGN 32 |ct_inverse_mod_256| PROC @@ -60,14 +61,14 @@ madd x4, x16, x8, xzr // |u|*|f0| madd x4, x17, x9, x4 // |v|*|g0| str x4, [x0,#8*4] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*5] stp x5, x5, [x0,#8*7] madd x4, x12, x8, xzr // |u|*|f1| madd x4, x13, x9, x4 // |v|*|g1| str x4, [x0,#8*9] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*10] stp x5, x5, [x0,#8*12] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm index 65665c9f17a..5cd09a1d8f2 100644 --- a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm @@ -3,6 +3,7 @@ OPTION DOTNAME PUBLIC ct_inverse_mod_256 + ALIGN 32 ct_inverse_mod_256 PROC PUBLIC DB 243,15,30,250 @@ -10,15 +11,14 @@ ct_inverse_mod_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_inverse_mod_256:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbp + mov 
rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 push rbx push r12 @@ -643,6 +643,7 @@ ct_inverse_mod_256 ENDP ALIGN 32 __smulq_512x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -791,6 +792,7 @@ __smulq_512x63 ENDP ALIGN 32 __smulq_256x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] @@ -898,6 +900,7 @@ __smulq_256x63 ENDP ALIGN 32 __smulq_256_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 + mov QWORD PTR[rdi],rdx mov QWORD PTR[8+rdi],rcx mov rbp,rdx @@ -1026,6 +1029,7 @@ __smulq_256_n_shift_by_31 ENDP ALIGN 32 __ab_approximation_31_256 PROC PRIVATE DB 243,15,30,250 + mov r9,QWORD PTR[24+rsi] mov r11,QWORD PTR[56+rsi] mov rbx,QWORD PTR[16+rsi] @@ -1079,6 +1083,7 @@ __ab_approximation_31_256 ENDP ALIGN 32 __inner_loop_31_256 PROC PRIVATE DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h mov r13,0800000007FFFFFFFh mov r15,07FFFFFFF7FFFFFFFh @@ -1127,6 +1132,7 @@ __inner_loop_31_256 ENDP ALIGN 32 __inner_loop_62_256 PROC PRIVATE DB 243,15,30,250 + mov r15d,edx mov rdx,1 xor rcx,rcx @@ -1187,8 +1193,9 @@ $L$SEH_info_ct_inverse_mod_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ct_inverse_mod_256_body:: DB 1,0,18,0 DB 000h,0f4h,086h,000h @@ -1200,6 +1207,8 @@ DB 000h,054h,08bh,000h DB 000h,074h,08dh,000h DB 000h,064h,08eh,000h DB 000h,001h,08ch,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ct_inverse_mod_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm index 4ab12e052df..311ce7638ce 100644 --- a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm +++ b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm @@ -1,6 +1,7 @@ AREA |.text|,CODE,ALIGN=8,ARM64 + EXPORT |ct_inverse_mod_383|[FUNC] ALIGN 32 |ct_inverse_mod_383| PROC @@ -71,7 +72,7 @@ adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*8] stp x5, x5, [x0,#8*10] @@ -82,7 +83,7 @@ adds x3, x3, x5 adc x4, x4, x6 stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extenstion + asr x5, x4, #63 // sign extension stp x5, x5, [x0,#8*14] stp x5, x5, [x0,#8*16] eor x1, x1, #256 // flip-flop src |a|b|u|v| diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm index ab72328f056..e2454897b33 100644 --- a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm @@ -1,6 +1,7 @@ AREA |.text|,CODE,ALIGN=8,ARM64 + EXPORT |ct_is_square_mod_384|[FUNC] ALIGN 32 |ct_is_square_mod_384| PROC diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm index 38de6fc1229..be00f479efb 100644 --- a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm @@ -3,6 +3,7 @@ OPTION DOTNAME PUBLIC ct_is_square_mod_384 + ALIGN 32 ct_is_square_mod_384 PROC PUBLIC DB 243,15,30,250 @@ -10,13 +11,12 @@ ct_is_square_mod_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_is_square_mod_384:: - mov rdi,rcx - mov rsi,rdx - push rbp + mov rdi,rcx + mov rsi,rdx push rbx push r12 @@ -133,6 +133,7 @@ ct_is_square_mod_384 ENDP ALIGN 32 
__smulq_384_n_shift_by_30 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -305,6 +306,7 @@ __smulq_384_n_shift_by_30 ENDP ALIGN 32 __ab_approximation_30 PROC PRIVATE DB 243,15,30,250 + mov rbx,QWORD PTR[88+rsi] mov r15,QWORD PTR[80+rsi] mov r14,QWORD PTR[72+rsi] @@ -369,6 +371,7 @@ __ab_approximation_30 ENDP ALIGN 32 __inner_loop_30 PROC PRIVATE DB 243,15,30,250 + mov rbx,07FFFFFFF80000000h mov rcx,0800000007FFFFFFFh lea r15,QWORD PTR[((-1))+rbx] @@ -430,6 +433,7 @@ __inner_loop_30 ENDP ALIGN 32 __inner_loop_48 PROC PRIVATE DB 243,15,30,250 + mov edi,48 $L$oop_48:: @@ -485,8 +489,9 @@ $L$SEH_info_ct_is_square_mod_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ct_is_square_mod_384_body:: DB 1,0,18,0 DB 000h,0f4h,043h,000h @@ -498,6 +503,8 @@ DB 000h,054h,048h,000h DB 000h,074h,04ah,000h DB 000h,064h,04bh,000h DB 000h,001h,049h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ct_is_square_mod_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm index de79f8ec80e..89fbe5d0666 100644 --- a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm @@ -1,8 +1,13 @@ OPTION DOTNAME +EXTERN ct_inverse_mod_383$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ct_inverse_mod_383 + ALIGN 32 ct_inverse_mod_383 PROC PUBLIC DB 243,15,30,250 @@ -10,13 +15,16 @@ ct_inverse_mod_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ct_inverse_mod_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz ct_inverse_mod_383$1 +endif push rbp push rbx @@ -548,6 +556,7 @@ ct_inverse_mod_383 ENDP ALIGN 32 __smulq_767x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -758,6 +767,7 @@ __smulq_767x63 ENDP ALIGN 32 __smulq_383x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -899,6 +909,7 @@ __smulq_383x63 ENDP ALIGN 32 __smulq_383_n_shift_by_62 PROC PRIVATE DB 243,15,30,250 + mov rbx,rdx mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] @@ -1075,6 +1086,7 @@ __smulq_383_n_shift_by_62 ENDP ALIGN 32 __ab_approximation_62 PROC PRIVATE DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] mov r11,QWORD PTR[88+rsi] mov rbx,QWORD PTR[32+rsi] @@ -1131,6 +1143,7 @@ ALIGN 8 DD 0 __inner_loop_62 PROC PRIVATE DB 243,15,30,250 + mov rdx,1 xor rcx,rcx xor r12,r12 @@ -1200,8 +1213,9 @@ $L$SEH_info_ct_inverse_mod_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ct_inverse_mod_383_body:: DB 1,0,18,0 DB 000h,0f4h,08bh,000h @@ -1213,6 +1227,8 @@ DB 000h,054h,090h,000h DB 000h,074h,092h,000h DB 000h,064h,093h,000h DB 000h,001h,091h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ct_inverse_mod_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm index df4c46a4c44..024da69a645 100644 --- a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm +++ b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm @@ -1,8 +1,10 @@ OPTION DOTNAME +PUBLIC 
ct_inverse_mod_383$1 .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC ctx_inverse_mod_383 + ALIGN 32 ctx_inverse_mod_383 PROC PUBLIC DB 243,15,30,250 @@ -10,13 +12,13 @@ ctx_inverse_mod_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_ctx_inverse_mod_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ct_inverse_mod_383$1:: push rbp push rbx @@ -814,7 +816,7 @@ $L$SEH_body_ctx_inverse_mod_383:: mov r10,QWORD PTR[48+rsi] - call __inner_loop_62 + call __tail_loop_53 @@ -890,6 +892,7 @@ ctx_inverse_mod_383 ENDP ALIGN 32 __smulx_767x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -1054,6 +1057,7 @@ __smulx_767x63 ENDP ALIGN 32 __smulx_383x63 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] mov r10,QWORD PTR[((0+16))+rsi] @@ -1161,6 +1165,7 @@ __smulx_383x63 ENDP ALIGN 32 __smulx_383_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 + mov rbx,rdx xor r14,r14 mov r8,QWORD PTR[((0+0))+rsi] @@ -1306,6 +1311,7 @@ __smulx_383_n_shift_by_31 ENDP ALIGN 32 __smulx_191_n_shift_by_31 PROC PRIVATE DB 243,15,30,250 + mov rbx,rdx mov r8,QWORD PTR[((0+0))+rsi] mov r9,QWORD PTR[((0+8))+rsi] @@ -1397,6 +1403,7 @@ __smulx_191_n_shift_by_31 ENDP ALIGN 32 __ab_approximation_31 PROC PRIVATE DB 243,15,30,250 + mov r9,QWORD PTR[40+rsi] mov r11,QWORD PTR[88+rsi] mov rbx,QWORD PTR[32+rsi] @@ -1467,6 +1474,7 @@ __ab_approximation_31 ENDP ALIGN 32 __inner_loop_31 PROC PRIVATE DB 243,15,30,250 + mov rcx,07FFFFFFF80000000h mov r13,0800000007FFFFFFFh mov r15,07FFFFFFF7FFFFFFFh @@ -1513,14 +1521,15 @@ __inner_loop_31 ENDP ALIGN 32 -__inner_loop_62 PROC PRIVATE +__tail_loop_53 PROC PRIVATE DB 243,15,30,250 + mov rdx,1 xor rcx,rcx xor r12,r12 mov r13,1 -$L$oop_62:: +$L$oop_53:: xor rax,rax test r8,1 mov rbx,r10 @@ -1547,10 +1556,10 @@ $L$oop_62:: sub rdx,rax sub rcx,rbx sub edi,1 - jnz $L$oop_62 + jnz $L$oop_53 DB 0F3h,0C3h ;repret -__inner_loop_62 ENDP +__tail_loop_53 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 @@ -1573,8 +1582,9 @@ $L$SEH_info_ctx_inverse_mod_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_ctx_inverse_mod_383_body:: DB 1,0,18,0 DB 000h,0f4h,08bh,000h @@ -1586,6 +1596,8 @@ DB 000h,054h,090h,000h DB 000h,074h,092h,000h DB 000h,064h,093h,000h DB 000h,001h,091h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_ctx_inverse_mod_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/div3w-armv8.asm b/crypto/blst_src/build/win64/div3w-armv8.asm index 7114ccf0c2e..aec90679eea 100644 --- a/crypto/blst_src/build/win64/div3w-armv8.asm +++ b/crypto/blst_src/build/win64/div3w-armv8.asm @@ -25,7 +25,7 @@ asr x3,x0,#63 // top bit -> mask add x0,x0,x0 // Q <<= 1 subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + specilative bit + add x0,x0,#1 // Q + speculative bit sbcs x7,x5,x2 sbc x0,x0,xzr // subtract speculative bit diff --git a/crypto/blst_src/build/win64/div3w-x86_64.asm b/crypto/blst_src/build/win64/div3w-x86_64.asm index c35f426f3d2..805c5b1fcb0 100644 --- a/crypto/blst_src/build/win64/div3w-x86_64.asm +++ b/crypto/blst_src/build/win64/div3w-x86_64.asm @@ -9,12 +9,14 @@ div_3_limbs PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi - mov rax,rsp + mov r11,rsp $L$SEH_begin_div_3_limbs:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - +$L$SEH_body_div_3_limbs:: mov r8,QWORD PTR[rdi] mov r9,QWORD PTR[8+rdi] @@ -47,9 +49,12 @@ $L$oop:: or rax,rcx 
+$L$SEH_epilogue_div_3_limbs:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret + $L$SEH_end_div_3_limbs:: div_3_limbs ENDP PUBLIC quot_rem_128 @@ -60,12 +65,14 @@ quot_rem_128 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi - mov rax,rsp + mov r11,rsp $L$SEH_begin_quot_rem_128:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - +$L$SEH_body_quot_rem_128:: mov rax,rdx mov rcx,rdx @@ -101,9 +108,12 @@ $L$SEH_begin_quot_rem_128:: mov rax,rcx +$L$SEH_epilogue_quot_rem_128:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret + $L$SEH_end_quot_rem_128:: quot_rem_128 ENDP @@ -119,12 +129,14 @@ quot_rem_64 PROC PUBLIC DB 243,15,30,250 mov QWORD PTR[8+rsp],rdi ;WIN64 prologue mov QWORD PTR[16+rsp],rsi - mov rax,rsp + mov r11,rsp $L$SEH_begin_quot_rem_64:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - +$L$SEH_body_quot_rem_64:: mov rax,rdx imul rdx,QWORD PTR[rsi] @@ -136,17 +148,110 @@ $L$SEH_begin_quot_rem_64:: mov QWORD PTR[rdi],r10 mov QWORD PTR[8+rdi],rax +$L$SEH_epilogue_quot_rem_64:: mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue mov rsi,QWORD PTR[16+rsp] + DB 0F3h,0C3h ;repret + $L$SEH_end_quot_rem_64:: quot_rem_64 ENDP .text$ ENDS .pdata SEGMENT READONLY ALIGN(4) ALIGN 4 + DD imagerel $L$SEH_begin_div_3_limbs + DD imagerel $L$SEH_body_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_prologue + + DD imagerel $L$SEH_body_div_3_limbs + DD imagerel $L$SEH_epilogue_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_body + + DD imagerel $L$SEH_epilogue_div_3_limbs + DD imagerel $L$SEH_end_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_epilogue + + DD imagerel $L$SEH_begin_quot_rem_128 + DD imagerel $L$SEH_body_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_prologue + + DD imagerel $L$SEH_body_quot_rem_128 + DD imagerel $L$SEH_epilogue_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_body + + DD imagerel $L$SEH_epilogue_quot_rem_128 + DD imagerel $L$SEH_end_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_epilogue + + DD imagerel $L$SEH_begin_quot_rem_64 + DD imagerel $L$SEH_body_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_prologue + + DD imagerel $L$SEH_body_quot_rem_64 + DD imagerel $L$SEH_epilogue_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_body + + DD imagerel $L$SEH_epilogue_quot_rem_64 + DD imagerel $L$SEH_end_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_epilogue + .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 +$L$SEH_info_div_3_limbs_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_div_3_limbs_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_div_3_limbs_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_quot_rem_128_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_quot_rem_128_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_quot_rem_128_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_quot_rem_64_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_quot_rem_64_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_quot_rem_64_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + .xdata ENDS 
END diff --git a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm index c3bf8634617..6aedca7cdaf 100644 --- a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm +++ b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm @@ -1,4 +1,11 @@ OPTION DOTNAME +EXTERN mul_mont_sparse_256$1:NEAR +EXTERN sqr_mont_sparse_256$1:NEAR +EXTERN from_mont_256$1:NEAR +EXTERN redc_mont_256$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC mul_mont_sparse_256 @@ -11,14 +18,17 @@ mul_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_sparse_256$1 +endif push rbp push rbx @@ -83,13 +93,16 @@ sqr_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_sparse_256$1 +endif push rbp push rbx @@ -148,6 +161,7 @@ sqr_mont_sparse_256 ENDP ALIGN 32 __mulq_mont_sparse_256 PROC PRIVATE DB 243,15,30,250 + mul r14 add r10,rax mov rax,r15 @@ -434,13 +448,16 @@ from_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_from_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz from_mont_256$1 +endif push rbp push rbx @@ -516,13 +533,16 @@ redc_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redc_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz redc_mont_256$1 +endif push rbp push rbx @@ -597,6 +617,7 @@ redc_mont_256 ENDP ALIGN 32 __mulq_by_1_mont_256 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r10,QWORD PTR[8+rsi] mov r11,QWORD PTR[16+rsi] @@ -787,8 +808,9 @@ $L$SEH_info_mul_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -800,7 +822,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -811,8 +834,9 @@ $L$SEH_info_sqr_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -824,7 +848,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -835,8 +860,9 @@ $L$SEH_info_from_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_from_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -848,7 +874,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_from_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -859,8 +886,9 @@ $L$SEH_info_redc_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 
-DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redc_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -872,7 +900,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redc_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm index 0ccb46786c3..8563815917e 100644 --- a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm +++ b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm @@ -1,4 +1,22 @@ OPTION DOTNAME +EXTERN mul_mont_384x$1:NEAR +EXTERN sqr_mont_384x$1:NEAR +EXTERN mul_382x$1:NEAR +EXTERN sqr_382x$1:NEAR +EXTERN mul_384$1:NEAR +EXTERN sqr_384$1:NEAR +EXTERN redc_mont_384$1:NEAR +EXTERN from_mont_384$1:NEAR +EXTERN sgn0_pty_mont_384$1:NEAR +EXTERN sgn0_pty_mont_384x$1:NEAR +EXTERN mul_mont_384$1:NEAR +EXTERN sqr_mont_384$1:NEAR +EXTERN sqr_n_mul_mont_384$1:NEAR +EXTERN sqr_n_mul_mont_383$1:NEAR +EXTERN sqr_mont_382x$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' @@ -9,8 +27,9 @@ OPTION DOTNAME ALIGN 32 -__sub_mod_384x384 PROC PRIVATE +__subq_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -71,12 +90,13 @@ __sub_mod_384x384 PROC PRIVATE mov QWORD PTR[88+rdi],rsi DB 0F3h,0C3h ;repret -__sub_mod_384x384 ENDP +__subq_mod_384x384 ENDP ALIGN 32 -__add_mod_384 PROC PRIVATE +__addq_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -120,12 +140,13 @@ __add_mod_384 PROC PRIVATE mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__add_mod_384 ENDP +__addq_mod_384 ENDP ALIGN 32 -__sub_mod_384 PROC PRIVATE +__subq_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -133,7 +154,7 @@ __sub_mod_384 PROC PRIVATE mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] -__sub_mod_384_a_is_loaded:: +__subq_mod_384_a_is_loaded:: sub r8,QWORD PTR[rdx] mov r14,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] @@ -169,7 +190,7 @@ __sub_mod_384_a_is_loaded:: mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__sub_mod_384 ENDP +__subq_mod_384 ENDP PUBLIC mul_mont_384x @@ -180,14 +201,17 @@ mul_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_384x$1 +endif push rbp push rbx @@ -228,12 +252,12 @@ $L$SEH_body_mul_mont_384x:: mov rcx,QWORD PTR[8+rsp] lea rdx,QWORD PTR[((-48))+rsi] lea rdi,QWORD PTR[((40+192+48))+rsp] - call __add_mod_384 + call __addq_mod_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((-48))+rdi] - call __add_mod_384 + call __addq_mod_384 lea rbx,QWORD PTR[rdi] lea rsi,QWORD PTR[48+rdi] @@ -243,17 +267,17 @@ $L$SEH_body_mul_mont_384x:: lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[40+rsp] mov rcx,QWORD PTR[8+rsp] - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[40+rsp] lea rdx,QWORD PTR[((40+96))+rsp] lea rdi,QWORD PTR[40+rsp] - call __sub_mod_384x384 + call __subq_mod_384x384 mov rbx,rcx @@ -262,14 +286,14 @@ $L$SEH_body_mul_mont_384x:: mov rcx,QWORD PTR[rsp] mov rdi,QWORD 
PTR[32+rsp] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 lea rsi,QWORD PTR[((40+192))+rsp] mov rcx,QWORD PTR[rsp] lea rdi,QWORD PTR[48+rdi] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 lea r8,QWORD PTR[328+rsp] mov r15,QWORD PTR[r8] @@ -304,13 +328,16 @@ sqr_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_384x$1 +endif push rbp push rbx @@ -336,13 +363,13 @@ $L$SEH_body_sqr_mont_384x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[32+rsp] - call __add_mod_384 + call __addq_mod_384 mov rsi,QWORD PTR[16+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((32+48))+rsp] - call __sub_mod_384 + call __subq_mod_384 mov rsi,QWORD PTR[16+rsp] @@ -435,13 +462,16 @@ mul_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_382x$1 +endif push rbp push rbx @@ -531,18 +561,18 @@ $L$SEH_body_mul_382x:: lea rdx,QWORD PTR[32+rsp] mov rcx,QWORD PTR[24+rsp] mov rdi,rsi - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subq_mod_384x384 lea rsi,QWORD PTR[((-96))+rdi] lea rdx,QWORD PTR[32+rsp] lea rdi,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subq_mod_384x384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] @@ -577,12 +607,15 @@ sqr_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_382x$1 +endif push rbp push rbx @@ -633,7 +666,7 @@ $L$SEH_body_sqr_382x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[48+rdi] - call __sub_mod_384_a_is_loaded + call __subq_mod_384_a_is_loaded lea rsi,QWORD PTR[rdi] @@ -716,12 +749,15 @@ mul_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_384$1 +endif push rbp push rbx @@ -755,6 +791,7 @@ mul_384 ENDP ALIGN 32 __mulq_384 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rbx] mov rbp,rax @@ -1046,11 +1083,14 @@ sqr_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_384:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_384$1 +endif push rbp push rbx @@ -1097,6 +1137,7 @@ sqr_384 ENDP ALIGN 32 __sqrq_384 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] mov rcx,QWORD PTR[16+rsi] @@ -1294,13 +1335,16 @@ sqr_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_384$1 +endif push rbp push rbx @@ -1330,7 +1374,7 @@ $L$SEH_body_sqr_mont_384:: mov rbx,QWORD PTR[104+rsp] mov rdi,QWORD PTR[112+rsp] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 lea r8,QWORD PTR[120+rsp] mov r15,QWORD PTR[120+rsp] @@ -1368,13 +1412,16 @@ redc_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redc_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef 
__BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz redc_mont_384$1 +endif push rbp push rbx @@ -1394,7 +1441,7 @@ $L$SEH_body_redc_mont_384:: mov rbx,rdx call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 mov r15,QWORD PTR[8+rsp] @@ -1432,13 +1479,16 @@ from_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_from_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz from_mont_384$1 +endif push rbp push rbx @@ -1515,6 +1565,7 @@ from_mont_384 ENDP ALIGN 32 __mulq_by_1_mont_384 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -1810,8 +1861,9 @@ __mulq_by_1_mont_384 ENDP ALIGN 32 -__redc_tail_mont_384 PROC PRIVATE +__redq_tail_mont_384 PROC PRIVATE DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] mov rax,r14 adc r15,QWORD PTR[56+rsi] @@ -1852,7 +1904,7 @@ __redc_tail_mont_384 PROC PRIVATE mov QWORD PTR[40+rdi],r11 DB 0F3h,0C3h ;repret -__redc_tail_mont_384 ENDP +__redq_tail_mont_384 ENDP PUBLIC sgn0_pty_mont_384 @@ -1864,12 +1916,15 @@ sgn0_pty_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sgn0_pty_mont_384$1 +endif push rbp push rbx @@ -1948,12 +2003,15 @@ sgn0_pty_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0_pty_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sgn0_pty_mont_384x$1 +endif push rbp push rbx @@ -2081,14 +2139,17 @@ mul_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mul_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_384$1 +endif push rbp push rbx @@ -2143,6 +2204,7 @@ mul_mont_384 ENDP ALIGN 32 __mulq_mont_384 PROC PRIVATE DB 243,15,30,250 + mov rdi,rax mul r14 mov r8,rax @@ -2750,15 +2812,18 @@ sqr_n_mul_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_n_mul_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_n_mul_mont_384$1 +endif push rbp push rbx @@ -2792,7 +2857,7 @@ $L$oop_sqr_384:: mov rcx,QWORD PTR[rsp] mov rbx,QWORD PTR[16+rsp] call __mulq_by_1_mont_384 - call __redc_tail_mont_384 + call __redq_tail_mont_384 movd edx,xmm1 lea rsi,QWORD PTR[rdi] @@ -2847,15 +2912,18 @@ sqr_n_mul_mont_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_n_mul_mont_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_n_mul_mont_383$1 +endif push rbp push rbx @@ -2950,6 +3018,7 @@ sqr_n_mul_mont_383 ENDP ALIGN 32 __mulq_mont_383_nonred PROC PRIVATE DB 243,15,30,250 + mov rbp,rax mul r14 mov r8,rax @@ -3514,13 +3583,16 @@ sqr_mont_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqr_mont_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_382x$1 +endif push rbp push rbx @@ -3882,8 +3954,9 @@ $L$SEH_info_mul_mont_384x_prologue:: DB 1,0,5,00bh DB 
0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,029h,000h @@ -3895,6 +3968,8 @@ DB 000h,054h,02eh,000h DB 000h,074h,030h,000h DB 000h,064h,031h,000h DB 000h,001h,02fh,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3905,8 +3980,9 @@ $L$SEH_info_sqr_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3918,6 +3994,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3928,8 +4006,9 @@ $L$SEH_info_mul_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3941,6 +4020,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3951,8 +4032,9 @@ $L$SEH_info_sqr_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_382x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3964,7 +4046,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3975,8 +4058,9 @@ $L$SEH_info_mul_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_384_body:: DB 1,0,11,0 DB 000h,0c4h,000h,000h @@ -3996,8 +4080,9 @@ $L$SEH_info_sqr_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4009,7 +4094,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4020,8 +4106,9 @@ $L$SEH_info_sqr_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_384_body:: DB 1,0,18,0 DB 000h,0f4h,00fh,000h @@ -4033,6 +4120,8 @@ DB 000h,054h,014h,000h DB 000h,074h,016h,000h DB 000h,064h,017h,000h DB 000h,001h,015h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4043,8 +4132,9 @@ $L$SEH_info_redc_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redc_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4056,7 +4146,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redc_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4067,8 +4158,9 @@ $L$SEH_info_from_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_from_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4080,7 +4172,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 
000h,000h,000h,000h $L$SEH_info_from_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4091,8 +4184,9 @@ $L$SEH_info_sgn0_pty_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4104,7 +4198,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4115,8 +4210,9 @@ $L$SEH_info_sgn0_pty_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0_pty_mont_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -4128,7 +4224,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0_pty_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4139,8 +4236,9 @@ $L$SEH_info_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mul_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -4152,7 +4250,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4163,8 +4262,9 @@ $L$SEH_info_sqr_n_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_n_mul_mont_384_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -4176,6 +4276,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_n_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4186,8 +4288,9 @@ $L$SEH_info_sqr_n_mul_mont_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_n_mul_mont_383_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -4199,6 +4302,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_n_mul_mont_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -4209,8 +4314,9 @@ $L$SEH_info_sqr_mont_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqr_mont_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -4222,6 +4328,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqr_mont_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm index 83534c629e9..21d18a8b40b 100644 --- a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm +++ b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm @@ -1,4 +1,8 @@ OPTION DOTNAME +PUBLIC mul_mont_sparse_256$1 +PUBLIC sqr_mont_sparse_256$1 +PUBLIC from_mont_256$1 +PUBLIC redc_mont_256$1 .text$ SEGMENT ALIGN(256) 'CODE' PUBLIC mulx_mont_sparse_256 @@ -11,14 +15,14 @@ mulx_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +mul_mont_sparse_256$1:: push rbp push rbx 
@@ -81,13 +85,13 @@ sqrx_mont_sparse_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_sparse_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_sparse_256$1:: push rbp push rbx @@ -144,6 +148,7 @@ sqrx_mont_sparse_256 ENDP ALIGN 32 __mulx_mont_sparse_256 PROC PRIVATE DB 243,15,30,250 + mulx r12,r15,r15 mulx r13,rbp,rbp add r11,r15 @@ -346,13 +351,13 @@ fromx_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_fromx_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +from_mont_256$1:: push rbp push rbx @@ -428,13 +433,13 @@ redcx_mont_256 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redcx_mont_256:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +redc_mont_256$1:: push rbp push rbx @@ -509,6 +514,7 @@ redcx_mont_256 ENDP ALIGN 32 __mulx_by_1_mont_256 PROC PRIVATE DB 243,15,30,250 + mov rax,QWORD PTR[rsi] mov r11,QWORD PTR[8+rsi] mov r12,QWORD PTR[16+rsi] @@ -699,8 +705,9 @@ $L$SEH_info_mulx_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -712,7 +719,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -723,8 +731,9 @@ $L$SEH_info_sqrx_mont_sparse_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_sparse_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -736,7 +745,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_sparse_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -747,8 +757,9 @@ $L$SEH_info_fromx_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_fromx_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -760,7 +771,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_fromx_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -771,8 +783,9 @@ $L$SEH_info_redcx_mont_256_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redcx_mont_256_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -784,7 +797,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redcx_mont_256_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm index 25bee97731b..4dc41b04098 100644 --- a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm +++ b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm @@ -1,4 +1,19 @@ OPTION DOTNAME +PUBLIC mul_mont_384x$1 +PUBLIC sqr_mont_384x$1 +PUBLIC mul_382x$1 +PUBLIC sqr_382x$1 +PUBLIC mul_384$1 +PUBLIC sqr_384$1 +PUBLIC redc_mont_384$1 +PUBLIC from_mont_384$1 +PUBLIC sgn0_pty_mont_384$1 +PUBLIC sgn0_pty_mont_384x$1 +PUBLIC mul_mont_384$1 +PUBLIC sqr_mont_384$1 +PUBLIC sqr_n_mul_mont_384$1 +PUBLIC sqr_n_mul_mont_383$1 +PUBLIC sqr_mont_382x$1 .text$ SEGMENT ALIGN(256) 'CODE' @@ -9,8 +24,9 @@ OPTION DOTNAME 
ALIGN 32 -__sub_mod_384x384 PROC PRIVATE +__subx_mod_384x384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -71,12 +87,13 @@ __sub_mod_384x384 PROC PRIVATE mov QWORD PTR[88+rdi],rsi DB 0F3h,0C3h ;repret -__sub_mod_384x384 ENDP +__subx_mod_384x384 ENDP ALIGN 32 -__add_mod_384 PROC PRIVATE +__addx_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -120,12 +137,13 @@ __add_mod_384 PROC PRIVATE mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__add_mod_384 ENDP +__addx_mod_384 ENDP ALIGN 32 -__sub_mod_384 PROC PRIVATE +__subx_mod_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov r9,QWORD PTR[8+rsi] mov r10,QWORD PTR[16+rsi] @@ -133,7 +151,7 @@ __sub_mod_384 PROC PRIVATE mov r12,QWORD PTR[32+rsi] mov r13,QWORD PTR[40+rsi] -__sub_mod_384_a_is_loaded:: +__subx_mod_384_a_is_loaded:: sub r8,QWORD PTR[rdx] mov r14,QWORD PTR[rcx] sbb r9,QWORD PTR[8+rdx] @@ -169,7 +187,7 @@ __sub_mod_384_a_is_loaded:: mov QWORD PTR[40+rdi],r13 DB 0F3h,0C3h ;repret -__sub_mod_384 ENDP +__subx_mod_384 ENDP PUBLIC mulx_mont_384x @@ -180,14 +198,14 @@ mulx_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +mul_mont_384x$1:: push rbp push rbx @@ -229,12 +247,12 @@ $L$SEH_body_mulx_mont_384x:: lea rsi,QWORD PTR[rbx] lea rdx,QWORD PTR[((-48))+rbx] lea rdi,QWORD PTR[((40+192+48))+rsp] - call __add_mod_384 + call __addx_mod_384 mov rsi,QWORD PTR[24+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((-48))+rdi] - call __add_mod_384 + call __addx_mod_384 lea rbx,QWORD PTR[rdi] lea rsi,QWORD PTR[48+rdi] @@ -244,17 +262,17 @@ $L$SEH_body_mulx_mont_384x:: lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[40+rsp] mov rcx,QWORD PTR[8+rsp] - call __sub_mod_384x384 + call __subx_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subx_mod_384x384 lea rsi,QWORD PTR[40+rsp] lea rdx,QWORD PTR[((40+96))+rsp] lea rdi,QWORD PTR[40+rsp] - call __sub_mod_384x384 + call __subx_mod_384x384 lea rbx,QWORD PTR[rcx] @@ -263,14 +281,14 @@ $L$SEH_body_mulx_mont_384x:: mov rcx,QWORD PTR[rsp] mov rdi,QWORD PTR[32+rsp] call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 lea rsi,QWORD PTR[((40+192))+rsp] mov rcx,QWORD PTR[rsp] lea rdi,QWORD PTR[48+rdi] call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 lea r8,QWORD PTR[328+rsp] mov r15,QWORD PTR[r8] @@ -305,13 +323,13 @@ sqrx_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_384x$1:: push rbp push rbx @@ -338,13 +356,13 @@ $L$SEH_body_sqrx_mont_384x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[32+rsp] - call __add_mod_384 + call __addx_mod_384 mov rsi,QWORD PTR[24+rsp] lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[((32+48))+rsp] - call __sub_mod_384 + call __subx_mod_384 mov rsi,QWORD PTR[24+rsp] @@ -447,13 +465,13 @@ mulx_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +mul_382x$1:: push rbp push rbx @@ -543,18 +561,18 @@ $L$SEH_body_mulx_382x:: lea rdx,QWORD PTR[32+rsp] mov rcx,QWORD PTR[24+rsp] mov rdi,rsi - call __sub_mod_384x384 + call __subx_mod_384x384 lea rsi,QWORD PTR[rdi] lea rdx,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subx_mod_384x384 lea 
rsi,QWORD PTR[((-96))+rdi] lea rdx,QWORD PTR[32+rsp] lea rdi,QWORD PTR[((-96))+rdi] - call __sub_mod_384x384 + call __subx_mod_384x384 lea r8,QWORD PTR[136+rsp] mov r15,QWORD PTR[r8] @@ -589,12 +607,12 @@ sqrx_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +sqr_382x$1:: push rbp push rbx @@ -645,7 +663,7 @@ $L$SEH_body_sqrx_382x:: lea rdx,QWORD PTR[48+rsi] lea rdi,QWORD PTR[48+rdi] - call __sub_mod_384_a_is_loaded + call __subx_mod_384_a_is_loaded lea rsi,QWORD PTR[rdi] @@ -728,12 +746,12 @@ mulx_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +mul_384$1:: push rbp push rbx @@ -779,6 +797,7 @@ mulx_384 ENDP ALIGN 32 __mulx_384 PROC PRIVATE DB 243,15,30,250 + mov rdx,QWORD PTR[rbx] mov r14,QWORD PTR[rsi] mov r15,QWORD PTR[8+rsi] @@ -957,11 +976,11 @@ sqrx_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_384:: - mov rdi,rcx - mov rsi,rdx - + mov rdi,rcx + mov rsi,rdx +sqr_384$1:: push rbp push rbx @@ -1007,6 +1026,7 @@ sqrx_384 ENDP ALIGN 32 __sqrx_384 PROC PRIVATE DB 243,15,30,250 + mov rdx,QWORD PTR[rsi] mov r14,QWORD PTR[8+rsi] mov r15,QWORD PTR[16+rsi] @@ -1153,13 +1173,13 @@ redcx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_redcx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +redc_mont_384$1:: push rbp push rbx @@ -1179,7 +1199,7 @@ $L$SEH_body_redcx_mont_384:: mov rbx,rdx call __mulx_by_1_mont_384 - call __redc_tail_mont_384 + call __redx_tail_mont_384 mov r15,QWORD PTR[8+rsp] @@ -1217,13 +1237,13 @@ fromx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_fromx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +from_mont_384$1:: push rbp push rbx @@ -1300,6 +1320,7 @@ fromx_mont_384 ENDP ALIGN 32 __mulx_by_1_mont_384 PROC PRIVATE DB 243,15,30,250 + mov r8,QWORD PTR[rsi] mov rdx,rcx mov r9,QWORD PTR[8+rsi] @@ -1486,8 +1507,9 @@ __mulx_by_1_mont_384 ENDP ALIGN 32 -__redc_tail_mont_384 PROC PRIVATE +__redx_tail_mont_384 PROC PRIVATE DB 243,15,30,250 + add r14,QWORD PTR[48+rsi] mov rax,r14 adc r15,QWORD PTR[56+rsi] @@ -1528,7 +1550,7 @@ __redc_tail_mont_384 PROC PRIVATE mov QWORD PTR[40+rdi],r11 DB 0F3h,0C3h ;repret -__redc_tail_mont_384 ENDP +__redx_tail_mont_384 ENDP PUBLIC sgn0x_pty_mont_384 @@ -1540,12 +1562,12 @@ sgn0x_pty_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0x_pty_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +sgn0_pty_mont_384$1:: push rbp push rbx @@ -1624,12 +1646,12 @@ sgn0x_pty_mont_384x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sgn0x_pty_mont_384x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 - - - +sgn0_pty_mont_384x$1:: push rbp push rbx @@ -1757,14 +1779,14 @@ mulx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_mulx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] - - - +mul_mont_384$1:: push rbp push rbx @@ -1825,6 +1847,7 @@ ALIGN 32 __mulx_mont_384 PROC PRIVATE DB 243,15,30,250 + mulx r10,r14,r15 mulx r11,r15,rax add r9,r14 @@ -2230,13 +2253,13 @@ sqrx_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_384:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_384$1:: push rbp push rbx @@ -2304,15 +2327,15 @@ sqrx_n_mul_mont_384 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_n_mul_mont_384:: + + mov rdi,rcx mov rsi,rdx mov 
rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +sqr_n_mul_mont_384$1:: push rbp push rbx @@ -2398,15 +2421,15 @@ sqrx_n_mul_mont_383 PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_n_mul_mont_383:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8,QWORD PTR[40+rsp] mov r9,QWORD PTR[48+rsp] - - - +sqr_n_mul_mont_383$1:: push rbp push rbx @@ -2485,6 +2508,7 @@ ALIGN 32 __mulx_mont_383_nonred PROC PRIVATE DB 243,15,30,250 + mulx r10,r14,r15 mulx r11,r15,rax add r9,r14 @@ -2851,13 +2875,13 @@ sqrx_mont_382x PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_sqrx_mont_382x:: + + mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 - - - +sqr_mont_382x$1:: push rbp push rbx @@ -3229,8 +3253,9 @@ $L$SEH_info_mulx_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,029h,000h @@ -3242,6 +3267,8 @@ DB 000h,054h,02eh,000h DB 000h,074h,030h,000h DB 000h,064h,031h,000h DB 000h,001h,02fh,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3252,8 +3279,9 @@ $L$SEH_info_sqrx_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_384x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3265,6 +3293,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3275,8 +3305,9 @@ $L$SEH_info_mulx_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3288,6 +3319,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3298,8 +3331,9 @@ $L$SEH_info_sqrx_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_382x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3311,7 +3345,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3322,8 +3357,9 @@ $L$SEH_info_mulx_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_384_body:: DB 1,0,17,0 DB 000h,0f4h,000h,000h @@ -3335,7 +3371,8 @@ DB 000h,054h,005h,000h DB 000h,074h,007h,000h DB 000h,064h,008h,000h DB 000h,052h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3346,8 +3383,9 @@ $L$SEH_info_sqrx_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3359,7 +3397,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3370,8 +3409,9 @@ $L$SEH_info_redcx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_redcx_mont_384_body:: DB 
1,0,17,0 DB 000h,0f4h,001h,000h @@ -3383,7 +3423,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_redcx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3394,8 +3435,9 @@ $L$SEH_info_fromx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_fromx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3407,7 +3449,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_fromx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3418,8 +3461,9 @@ $L$SEH_info_sgn0x_pty_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0x_pty_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3431,7 +3475,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3442,8 +3487,9 @@ $L$SEH_info_sgn0x_pty_mont_384x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sgn0x_pty_mont_384x_body:: DB 1,0,17,0 DB 000h,0f4h,001h,000h @@ -3455,7 +3501,8 @@ DB 000h,054h,006h,000h DB 000h,074h,008h,000h DB 000h,064h,009h,000h DB 000h,062h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sgn0x_pty_mont_384x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3466,8 +3513,9 @@ $L$SEH_info_mulx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_mulx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -3479,7 +3527,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_mulx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3490,8 +3539,9 @@ $L$SEH_info_sqrx_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,003h,000h @@ -3503,7 +3553,8 @@ DB 000h,054h,008h,000h DB 000h,074h,00ah,000h DB 000h,064h,00bh,000h DB 000h,082h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3514,8 +3565,9 @@ $L$SEH_info_sqrx_n_mul_mont_384_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_n_mul_mont_384_body:: DB 1,0,17,0 DB 000h,0f4h,005h,000h @@ -3527,7 +3579,8 @@ DB 000h,054h,00ah,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h DB 000h,0a2h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_384_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3538,8 +3591,9 @@ $L$SEH_info_sqrx_n_mul_mont_383_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_n_mul_mont_383_body:: DB 1,0,17,0 DB 000h,0f4h,005h,000h @@ -3551,7 +3605,8 @@ DB 000h,054h,00ah,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h DB 000h,0a2h -DB 000h,000h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_n_mul_mont_383_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h @@ -3562,8 +3617,9 @@ 
$L$SEH_info_sqrx_mont_382x_prologue:: DB 1,0,5,00bh DB 0,074h,1,0 DB 0,064h,2,0 -DB 0,003h +DB 0,0b3h DB 0,0 + DD 0,0 $L$SEH_info_sqrx_mont_382x_body:: DB 1,0,18,0 DB 000h,0f4h,011h,000h @@ -3575,6 +3631,8 @@ DB 000h,054h,016h,000h DB 000h,074h,018h,000h DB 000h,064h,019h,000h DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_sqrx_mont_382x_epilogue:: DB 1,0,4,0 DB 000h,074h,001h,000h diff --git a/crypto/blst_src/build/win64/sha256-armv8.asm b/crypto/blst_src/build/win64/sha256-armv8.asm index 0e0c54cb65b..31e74219c19 100644 --- a/crypto/blst_src/build/win64/sha256-armv8.asm +++ b/crypto/blst_src/build/win64/sha256-armv8.asm @@ -10,11 +10,12 @@ // // sha256_block procedure for ARMv8. // -// This module is stripped of scalar code paths, with raionale that all +// This module is stripped of scalar code paths, with rationale that all // known processors are NEON-capable. // // See original module at CRYPTOGAMS for further details. + COMMON |__blst_platform_cap|,4 AREA |.text|,CODE,ALIGN=8,ARM64 ALIGN 64 @@ -184,6 +185,11 @@ EXPORT |blst_sha256_block_data_order|[FUNC] ALIGN 16 |blst_sha256_block_data_order| PROC + adrp x16,__blst_platform_cap + ldr w16,[x16,__blst_platform_cap] + tst w16,#1 + bne |$Lv8_entry| + stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 diff --git a/crypto/blst_src/build/win64/sha256-x86_64.asm b/crypto/blst_src/build/win64/sha256-x86_64.asm index d3b409235e7..a502a75ecaf 100644 --- a/crypto/blst_src/build/win64/sha256-x86_64.asm +++ b/crypto/blst_src/build/win64/sha256-x86_64.asm @@ -1,4 +1,7 @@ OPTION DOTNAME +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS .text$ SEGMENT ALIGN(256) 'CODE' ALIGN 64 @@ -38,23 +41,23 @@ blst_sha256_block_data_order_shaext PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_blst_sha256_block_data_order_shaext:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - - sub rsp,058h - movaps XMMWORD PTR[(-88)+r11],xmm6 - movaps XMMWORD PTR[(-72)+r11],xmm7 + push rbp - movaps XMMWORD PTR[(-56)+r11],xmm8 + mov rbp,rsp - movaps XMMWORD PTR[(-40)+r11],xmm9 + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +$L$blst_sha256_block_data_order$2:: + sub rsp,050h - movaps XMMWORD PTR[(-24)+r11],xmm10 + movaps XMMWORD PTR[(-80)+rbp],xmm6 + movaps XMMWORD PTR[(-64)+rbp],xmm7 + movaps XMMWORD PTR[(-48)+rbp],xmm8 + movaps XMMWORD PTR[(-32)+rbp],xmm9 + movaps XMMWORD PTR[(-16)+rbp],xmm10 $L$SEH_body_blst_sha256_block_data_order_shaext:: @@ -259,16 +262,18 @@ DB 102,15,58,15,215,8 movdqu XMMWORD PTR[rdi],xmm1 movdqu XMMWORD PTR[16+rdi],xmm2 - movaps xmm6,XMMWORD PTR[((-88))+r11] - movaps xmm7,XMMWORD PTR[((-72))+r11] - movaps xmm8,XMMWORD PTR[((-56))+r11] - movaps xmm9,XMMWORD PTR[((-40))+r11] - movaps xmm10,XMMWORD PTR[((-24))+r11] - mov rsp,r11 + movaps xmm6,XMMWORD PTR[((-80))+rbp] + movaps xmm7,XMMWORD PTR[((-64))+rbp] + movaps xmm8,XMMWORD PTR[((-48))+rbp] + movaps xmm9,XMMWORD PTR[((-32))+rbp] + movaps xmm10,XMMWORD PTR[((-16))+rbp] + mov rsp,rbp + + pop rbp $L$SEH_epilogue_blst_sha256_block_data_order_shaext:: - mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue - mov rsi,QWORD PTR[16+r11] + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] DB 0F3h,0C3h ;repret @@ -284,14 +289,17 @@ blst_sha256_block_data_order PROC PUBLIC mov QWORD PTR[16+rsp],rsi mov r11,rsp $L$SEH_begin_blst_sha256_block_data_order:: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbp + mov rbp,rsp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + test DWORD PTR[__blst_platform_cap],2 + jnz $L$blst_sha256_block_data_order$2 push rbx 
push r12 @@ -303,21 +311,16 @@ $L$SEH_begin_blst_sha256_block_data_order:: push r15 shl rdx,4 - sub rsp,104 + sub rsp,88 lea rdx,QWORD PTR[rdx*4+rsi] - mov QWORD PTR[rsp],rdi + mov QWORD PTR[((-64))+rbp],rdi - mov QWORD PTR[16+rsp],rdx - movaps XMMWORD PTR[32+rsp],xmm6 - - movaps XMMWORD PTR[48+rsp],xmm7 - - movaps XMMWORD PTR[64+rsp],xmm8 - - movaps XMMWORD PTR[80+rsp],xmm9 - - mov rbp,rsp + mov QWORD PTR[((-48))+rbp],rdx + movaps XMMWORD PTR[(-128)+rbp],xmm6 + movaps XMMWORD PTR[(-112)+rbp],xmm7 + movaps XMMWORD PTR[(-96)+rbp],xmm8 + movaps XMMWORD PTR[(-80)+rbp],xmm9 $L$SEH_body_blst_sha256_block_data_order:: @@ -338,7 +341,7 @@ $L$SEH_body_blst_sha256_block_data_order:: ALIGN 16 $L$loop_ssse3:: movdqa xmm7,XMMWORD PTR[((K256+256))] - mov QWORD PTR[8+rbp],rsi + mov QWORD PTR[((-56))+rbp],rsi movdqu xmm0,XMMWORD PTR[rsi] movdqu xmm1,XMMWORD PTR[16+rsi] movdqu xmm2,XMMWORD PTR[32+rsi] @@ -1363,9 +1366,9 @@ DB 102,15,58,15,249,4 add eax,r15d mov r13d,r8d add r14d,eax - mov rdi,QWORD PTR[rbp] + mov rdi,QWORD PTR[((-64))+rbp] mov eax,r14d - mov rsi,QWORD PTR[8+rbp] + mov rsi,QWORD PTR[((-56))+rbp] add eax,DWORD PTR[rdi] add ebx,DWORD PTR[4+rdi] @@ -1377,7 +1380,7 @@ DB 102,15,58,15,249,4 add r11d,DWORD PTR[28+rdi] lea rsi,QWORD PTR[64+rsi] - cmp rsi,QWORD PTR[16+rbp] + cmp rsi,QWORD PTR[((-48))+rbp] mov DWORD PTR[rdi],eax mov DWORD PTR[4+rdi],ebx @@ -1390,33 +1393,27 @@ DB 102,15,58,15,249,4 jb $L$loop_ssse3 xorps xmm0,xmm0 - lea r11,QWORD PTR[((104+48))+rbp] - movaps XMMWORD PTR[rsp],xmm0 movaps XMMWORD PTR[16+rsp],xmm0 movaps XMMWORD PTR[32+rsp],xmm0 movaps XMMWORD PTR[48+rsp],xmm0 - movaps xmm6,XMMWORD PTR[32+rbp] - movaps xmm7,XMMWORD PTR[48+rbp] - movaps xmm8,XMMWORD PTR[64+rbp] - movaps xmm9,XMMWORD PTR[80+rbp] - mov r15,QWORD PTR[104+rbp] - - mov r14,QWORD PTR[((-40))+r11] - - mov r13,QWORD PTR[((-32))+r11] - - mov r12,QWORD PTR[((-24))+r11] - - mov rbx,QWORD PTR[((-16))+r11] - - mov rbp,QWORD PTR[((-8))+r11] + movaps xmm6,XMMWORD PTR[((-128))+rbp] + movaps xmm7,XMMWORD PTR[((-112))+rbp] + movaps xmm8,XMMWORD PTR[((-96))+rbp] + movaps xmm9,XMMWORD PTR[((-80))+rbp] + mov r15,QWORD PTR[((-40))+rbp] + mov r14,QWORD PTR[((-32))+rbp] + mov r13,QWORD PTR[((-24))+rbp] + mov r12,QWORD PTR[((-16))+rbp] + mov rbx,QWORD PTR[((-8))+rbp] + mov rsp,rbp + + pop rbp $L$SEH_epilogue_blst_sha256_block_data_order:: - mov rdi,QWORD PTR[8+r11] ;WIN64 epilogue - mov rsi,QWORD PTR[16+r11] + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] - lea rsp,QWORD PTR[r11] DB 0F3h,0C3h ;repret $L$SEH_end_blst_sha256_block_data_order:: @@ -1427,6 +1424,7 @@ PUBLIC blst_sha256_emit ALIGN 16 blst_sha256_emit PROC PUBLIC DB 243,15,30,250 + mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] @@ -1456,6 +1454,7 @@ PUBLIC blst_sha256_bcopy ALIGN 16 blst_sha256_bcopy PROC PUBLIC DB 243,15,30,250 + sub rcx,rdx $L$oop_bcopy:: movzx eax,BYTE PTR[rdx] @@ -1472,6 +1471,7 @@ PUBLIC blst_sha256_hcopy ALIGN 16 blst_sha256_hcopy PROC PUBLIC DB 243,15,30,250 + mov r8,QWORD PTR[rdx] mov r9,QWORD PTR[8+rdx] mov r10,QWORD PTR[16+rdx] @@ -1513,13 +1513,14 @@ ALIGN 4 .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 $L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,003h -DB 0,0 +DB 1,4,6,005h +DB 4,074h,2,0 +DB 4,064h,3,0 +DB 4,053h +DB 1,050h + DD 0,0 $L$SEH_info_blst_sha256_block_data_order_shaext_body:: -DB 1,0,15,0 +DB 1,0,17,85 DB 000h,068h,000h,000h DB 000h,078h,001h,000h DB 000h,088h,002h,000h @@ -1527,43 +1528,47 @@ DB 000h,098h,003h,000h DB 
000h,0a8h,004h,000h DB 000h,074h,00ch,000h DB 000h,064h,00dh,000h -DB 000h,0a2h +DB 000h,053h +DB 000h,092h +DB 000h,050h DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: -DB 1,0,5,11 +DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h -DB 000h,003h -DB 000h,000h +DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,003h -DB 0,0 +DB 1,4,6,005h +DB 4,074h,2,0 +DB 4,064h,3,0 +DB 4,053h +DB 1,050h + DD 0,0 $L$SEH_info_blst_sha256_block_data_order_body:: -DB 1,0,26,5 -DB 000h,068h,002h,000h -DB 000h,078h,003h,000h -DB 000h,088h,004h,000h -DB 000h,098h,005h,000h -DB 000h,0f4h,00dh,000h -DB 000h,0e4h,00eh,000h -DB 000h,0d4h,00fh,000h -DB 000h,0c4h,010h,000h -DB 000h,034h,011h,000h -DB 000h,074h,014h,000h -DB 000h,064h,015h,000h -DB 000h,003h -DB 000h,001h,012h,000h +DB 1,0,25,133 +DB 000h,068h,000h,000h +DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0f4h,00bh,000h +DB 000h,0e4h,00ch,000h +DB 000h,0d4h,00dh,000h +DB 000h,0c4h,00eh,000h +DB 000h,034h,00fh,000h +DB 000h,074h,012h,000h +DB 000h,064h,013h,000h +DB 000h,053h +DB 000h,0f2h DB 000h,050h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h $L$SEH_info_blst_sha256_block_data_order_epilogue:: -DB 1,0,5,11 +DB 1,0,4,0 DB 000h,074h,001h,000h DB 000h,064h,002h,000h -DB 000h,003h -DB 000h,000h +DB 000h,000h,000h,000h .xdata ENDS diff --git a/crypto/blst_src/bulk_addition.c b/crypto/blst_src/bulk_addition.c index 81afc530665..4d36f405b64 100644 --- a/crypto/blst_src/bulk_addition.c +++ b/crypto/blst_src/bulk_addition.c @@ -145,8 +145,7 @@ static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ size_t npoints) \ { \ - /* Performance with 288K scratch is within 1-2-3% from optimal */ \ - const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \ + const size_t stride = SCRATCH_LIMIT / sizeof(ptype); \ ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ sizeof(ptype)); \ const ptype##_affine *point = NULL; \ @@ -163,6 +162,15 @@ void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ } \ } +#ifndef SCRATCH_LIMIT +# ifdef __wasm__ +# define SCRATCH_LIMIT (45 * 1024) +# else + /* Performance with 144K scratch is within 1-2-3% from optimal */ +# define SCRATCH_LIMIT (144 * 1024) +# endif +#endif + ADDITION_BTREE(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p2) ADDITION_BTREE(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) diff --git a/crypto/blst_src/bytes.h b/crypto/blst_src/bytes.h index af910ba8145..d81ffba5d46 100644 --- a/crypto/blst_src/bytes.h +++ b/crypto/blst_src/bytes.h @@ -26,7 +26,7 @@ static inline void limbs_from_be_bytes(limb_t *restrict ret, * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper * to perform redundant stores than to pay penalty for * mispredicted branch. Besides, some compilers unroll the - * loop and remove redundant stores to 'restict'-ed storage... + * loop and remove redundant stores to 'restrict'-ed storage... */ ret[n / sizeof(limb_t)] = limb; } @@ -55,7 +55,7 @@ static inline void limbs_from_le_bytes(limb_t *restrict ret, * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper * to perform redundant stores than to pay penalty for * mispredicted branch. Besides, some compilers unroll the - * loop and remove redundant stores to 'restict'-ed storage... 
+ * loop and remove redundant stores to 'restrict'-ed storage... */ ret[n / sizeof(limb_t)] = limb; } diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c new file mode 100644 index 00000000000..0fcf563f502 --- /dev/null +++ b/crypto/blst_src/client_min_pk.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c new file mode 100644 index 00000000000..8e4663daede --- /dev/null +++ b/crypto/blst_src/client_min_sig.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/cpuid.c b/crypto/blst_src/cpuid.c new file mode 100644 index 00000000000..43b9229d341 --- /dev/null +++ b/crypto/blst_src/cpuid.c @@ -0,0 +1,85 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#if (defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C)) && !defined(_WIN32) +__attribute__((visibility("hidden"))) +#endif +int __blst_platform_cap = 0; + +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) + +# if defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C) +static void __cpuidex(int info[4], int func, int sub) +{ + int eax, ebx, ecx, edx; + + __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(func), "c"(sub)); + + info[0] = eax; + info[1] = ebx; + info[2] = ecx; + info[3] = edx; +} +# else +# include +# endif + +# if defined(__GNUC__) || defined(__clang__) +__attribute__((constructor)) +# endif +static int __blst_cpuid(void) +{ + int info[4], cap = 0; + + __cpuidex(info, 0, 0); + if (info[0] > 6) { + __cpuidex(info, 7, 0); + cap |= (info[1]>>19) & 1; /* ADX */ + cap |= (info[1]>>28) & 2; /* SHA */ + } + + __blst_platform_cap = cap; + + return 0; +} + +# if defined(_MSC_VER) && !defined(__clang__) +# pragma section(".CRT$XCU",read) +__declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid; +# elif defined(__SUNPRO_C) +# pragma init(__blst_cpuid) +# endif + +#elif defined(__aarch64__) || defined(__aarch64) + +# if defined(__linux__) && (defined(__GNUC__) || defined(__clang__)) +extern unsigned long getauxval(unsigned long type) __attribute__ ((weak)); + +__attribute__((constructor)) +static int __blst_cpuid(void) +{ + int cap = 0; + + if (getauxval) { + unsigned long hwcap_ce = getauxval(16); + cap = (hwcap_ce>>6) & 1; /* SHA256 */ + } + + __blst_platform_cap = cap; + + return 0; +} +# elif defined(__APPLE__) && (defined(__GNUC__) || defined(__clang__)) +__attribute__((constructor)) +static int __blst_cpuid() +{ + __blst_platform_cap = 1; /* SHA256 */ + return 0; +} +# endif + +#endif diff --git a/crypto/blst_src/e1.c b/crypto/blst_src/e1.c index 91c4cdbf39c..f8a7be7bc14 100644 --- a/crypto/blst_src/e1.c +++ b/crypto/blst_src/e1.c 
@@ -155,7 +155,7 @@ void blst_p1_affine_serialize(unsigned char out[96], { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 96); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE1_affine_Serialize_BE(out, in); } @@ -178,7 +178,7 @@ static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 96); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE1_Serialize_BE(out, in); } @@ -202,7 +202,7 @@ void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 48); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE1_affine_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); @@ -226,7 +226,7 @@ void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 48); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE1_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); diff --git a/crypto/blst_src/e2.c b/crypto/blst_src/e2.c index 822ac23c694..77f8064bce2 100644 --- a/crypto/blst_src/e2.c +++ b/crypto/blst_src/e2.c @@ -196,7 +196,7 @@ void blst_p2_affine_serialize(unsigned char out[192], { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 192); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE2_affine_Serialize_BE(out, in); } @@ -219,7 +219,7 @@ static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 192); - out[0] = 0x40; /* infinitiy bit */ + out[0] = 0x40; /* infinity bit */ } else { (void)POINTonE2_Serialize_BE(out, in); } @@ -245,7 +245,7 @@ void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) { if (vec_is_zero(in->X, 2*sizeof(in->X))) { bytes_zero(out, 96); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE2_affine_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); @@ -269,7 +269,7 @@ void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) { if (vec_is_zero(in->Z, sizeof(in->Z))) { bytes_zero(out, 96); - out[0] = 0xc0; /* compressed and infinitiy bits */ + out[0] = 0xc0; /* compressed and infinity bits */ } else { limb_t sign = POINTonE2_Compress_BE(out, in); out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); diff --git a/crypto/blst_src/ec_mult.h b/crypto/blst_src/ec_mult.h index 192f7337cbf..3c23489570c 100644 --- a/crypto/blst_src/ec_mult.h +++ b/crypto/blst_src/ec_mult.h @@ -46,9 +46,10 @@ static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) static limb_t booth_encode(limb_t wval, size_t sz) { limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + launder(mask); wval = (wval + 1) >> 1; - wval = (wval & ~mask) | ((0-wval) & mask); + wval = (wval ^ mask) - mask; /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ return wval; @@ -61,7 +62,7 @@ static limb_t booth_encode(limb_t wval, size_t sz) * pass order's bit-length, which is customarily publicly known, instead * of the factual scalars' bit-lengths. 
This is facilitated by point * addition subroutines implemented to handle points at infinity, which - * are encoded as Z==0. [Doubling agorithms handle such points at + * are encoded as Z==0. [Doubling algorithms handle such points at * infinity "naturally," since resulting Z is product of original Z.] */ #define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ diff --git a/crypto/blst_src/exports.c b/crypto/blst_src/exports.c index ad720999883..1ca4d4757fa 100644 --- a/crypto/blst_src/exports.c +++ b/crypto/blst_src/exports.c @@ -19,7 +19,7 @@ #include "bytes.h" /* - * BLS12-381-specifc Fr shortcuts to assembly. + * BLS12-381-specific Fr shortcuts to assembly. */ void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) { add_mod_256(ret, a, b, BLS12_381_r); } @@ -39,6 +39,24 @@ void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) { mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } +void blst_fr_ct_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + mul_mont_sparse_256(x2, x1, twiddle, BLS12_381_r, r0); + sub_mod_256(x1, x0, x2, BLS12_381_r); + add_mod_256(x0, x0, x2, BLS12_381_r); +} + +void blst_fr_gs_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + sub_mod_256(x2, x0, x1, BLS12_381_r); + add_mod_256(x0, x0, x1, BLS12_381_r); + mul_mont_sparse_256(x1, x2, twiddle, BLS12_381_r, r0); +} + void blst_fr_sqr(vec256 ret, const vec256 a) { sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } @@ -102,27 +120,26 @@ int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) { - vec256 a_fr, b_fr; + vec256 t[2]; const union { long one; char little; } is_endian = { 1 }; + bool_t is_zero; if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { - limbs_from_le_bytes(a_fr, a, sizeof(a_fr)); - limbs_from_le_bytes(b_fr, b, sizeof(a_fr)); - a = (const byte *)a_fr; - b = (const byte *)b_fr; + limbs_from_le_bytes(t[0], a, sizeof(pow256)); + limbs_from_le_bytes(t[1], b, sizeof(pow256)); + a = (const byte *)t[0]; + b = (const byte *)t[1]; } - mul_mont_sparse_256(a_fr, (const limb_t *)a, BLS12_381_rRR, - BLS12_381_r, r0); - mul_mont_sparse_256(b_fr, (const limb_t *)b, BLS12_381_rRR, - BLS12_381_r, r0); - mul_mont_sparse_256(a_fr, a_fr, b_fr, BLS12_381_r, r0); - from_mont_256(a_fr, a_fr, BLS12_381_r, r0); - le_bytes_from_limbs(ret, a_fr, sizeof(a_fr)); - - return (int)(vec_is_zero(a_fr, sizeof(a_fr)) ^ 1); + mul_mont_sparse_256(t[0], BLS12_381_rRR, (const limb_t *)a, BLS12_381_r, r0); + mul_mont_sparse_256(t[0], t[0], (const limb_t *)b, BLS12_381_r, r0); + le_bytes_from_limbs(ret, t[0], sizeof(pow256)); + is_zero = vec_is_zero(t[0], sizeof(vec256)); + vec_zero(t, sizeof(t)); + + return (int)(is_zero^1); } void blst_sk_inverse(pow256 ret, const pow256 a) @@ -150,7 +167,7 @@ void blst_sk_inverse(pow256 ret, const pow256 a) } /* - * BLS12-381-specifc Fp shortcuts to assembly. + * BLS12-381-specific Fp shortcuts to assembly. */ void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) { add_fp(ret, a, b); } @@ -284,7 +301,7 @@ void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) } /* - * BLS12-381-specifc Fp2 shortcuts to assembly. + * BLS12-381-specific Fp2 shortcuts to assembly. 
*/ void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) { add_fp2(ret, a, b); } @@ -311,7 +328,7 @@ void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) { cneg_fp2(ret, a, is_zero(flag) ^ 1); } /* - * Scalar serialization/deseriazation + * Scalar serialization/deserialization. */ void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) { @@ -480,68 +497,75 @@ void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) { - struct { vec256 out, digit, radix; } t; + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; limb_t ret; vec_zero(t.out, sizeof(t.out)); - vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); - while (n > 32) { - limbs_from_le_bytes(t.digit, bytes, 32); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + n -= rem; + limbs_from_le_bytes(t.out, bytes += n, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n) { + limbs_from_le_bytes(t.digit, bytes -= 32, 32); add_mod_256(t.out, t.out, t.digit, BLS12_381_r); - mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); - bytes += 32; + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); n -= 32; } - vec_zero(t.digit, sizeof(t.digit)); - limbs_from_le_bytes(t.digit, bytes, n); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); - add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + from_mont_256(t.out, t.out, BLS12_381_r, r0); ret = vec_is_zero(t.out, sizeof(t.out)); le_bytes_from_limbs(out, t.out, 32); - vec_zero(t.out, 2*sizeof(t.out)); + vec_zero(&t, sizeof(t)); return (int)(ret^1); } int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) { - struct { vec256 out, digit, radix; } t; + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; limb_t ret; vec_zero(t.out, sizeof(t.out)); - vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); - bytes += n; - while (n > 32) { - limbs_from_be_bytes(t.digit, bytes -= 32, 32); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + limbs_from_be_bytes(t.out, bytes, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n -= rem) { + limbs_from_be_bytes(t.digit, bytes += rem, 32); add_mod_256(t.out, t.out, t.digit, BLS12_381_r); - mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); - n -= 32; + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + rem = 32; } - vec_zero(t.digit, sizeof(t.digit)); - limbs_from_be_bytes(t.digit, bytes -= n, n); - from_mont_256(t.digit, t.digit, BLS12_381_r, r0); - mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); - add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + from_mont_256(t.out, t.out, BLS12_381_r, r0); ret = vec_is_zero(t.out, sizeof(t.out)); le_bytes_from_limbs(out, t.out, 32); - vec_zero(t.out, 2*sizeof(t.out)); + vec_zero(&t, sizeof(t)); return (int)(ret^1); } /* - * Test facilitator + * Single-short SHA-256 hash function. + */ +#include "sha256.h" + +void blst_sha256(unsigned char md[32], const void *msg, size_t len) +{ + SHA256_CTX ctx; + + sha256_init(&ctx); + sha256_update(&ctx, msg, len); + sha256_final(md, &ctx); +} + +/* + * Test facilitator. 
*/ void blst_scalar_from_hexascii(pow256 ret, const char *hex) { bytes_from_hexascii(ret, sizeof(pow256), hex); } diff --git a/crypto/blst_src/fields.h b/crypto/blst_src/fields.h index 515219f62dd..4b2323d2cce 100644 --- a/crypto/blst_src/fields.h +++ b/crypto/blst_src/fields.h @@ -10,7 +10,7 @@ #include "consts.h" /* - * BLS12-381-specifc Fp shortcuts to assembly. + * BLS12-381-specific Fp shortcuts to assembly. */ static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) { add_mod_384(ret, a, b, BLS12_381_P); } @@ -49,7 +49,7 @@ static inline void redc_fp(vec384 ret, const vec768 a) { redc_mont_384(ret, a, BLS12_381_P, p0); } /* - * BLS12-381-specifc Fp2 shortcuts to assembly. + * BLS12-381-specific Fp2 shortcuts to assembly. */ static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) { add_mod_384x(ret, a, b, BLS12_381_P); } diff --git a/crypto/blst_src/fp12_tower.c b/crypto/blst_src/fp12_tower.c index ab247a8ebf0..d6c0b124eb6 100644 --- a/crypto/blst_src/fp12_tower.c +++ b/crypto/blst_src/fp12_tower.c @@ -545,7 +545,7 @@ static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) mul_by_u_plus_1_fp2(c1, c1); mul_fp2(t0, a[0], a[1]); sub_fp2(c1, c1, t0); - + /* c2 = a1^2 - a0*a2 */ sqr_fp2(c2, a[1]); mul_fp2(t0, a[0], a[2]); @@ -733,7 +733,7 @@ static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) /* - * BLS12-381-specifc Fp12 shortcuts. + * BLS12-381-specific Fp12 shortcuts. */ void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) { sqr_fp12(ret, a); } diff --git a/crypto/blst_src/multi_scalar.c b/crypto/blst_src/multi_scalar.c index d0b3deefe25..55ab8227718 100644 --- a/crypto/blst_src/multi_scalar.c +++ b/crypto/blst_src/multi_scalar.c @@ -399,7 +399,20 @@ void prefix##s_mult_pippenger(ptype *ret, \ size_t npoints, \ const byte *const scalars[], size_t nbits, \ ptype##xyzz scratch[]) \ -{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); } +{ \ + if (npoints == 1) { \ + prefix##_from_affine(ret, points[0]); \ + prefix##_mult(ret, ret, scalars[0], nbits); \ + return; \ + } \ + if ((npoints * sizeof(ptype##_affine) * 8 * 3) <= SCRATCH_LIMIT) { \ + ptype##_affine *table = alloca(npoints * sizeof(ptype##_affine) * 8); \ + ptype##s_precompute_wbits(table, 4, points, npoints); \ + ptype##s_mult_wbits(ret, table, 4, npoints, scalars, nbits, NULL); \ + return; \ + } \ + ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); \ +} DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) diff --git a/crypto/blst_src/pairing.c b/crypto/blst_src/pairing.c index b256c44d68a..1396bbadd3b 100644 --- a/crypto/blst_src/pairing.c +++ b/crypto/blst_src/pairing.c @@ -409,6 +409,55 @@ void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, P ? P : (const POINTonE1_affine *)&BLS12_381_G1, 1); } +#ifndef MILLER_LOOP_N_MAX +# define MILLER_LOOP_N_MAX 16 +#endif + +void blst_miller_loop_n(vec384fp12 out, const POINTonE2_affine *const Qs[], + const POINTonE1_affine *const Ps[], + size_t n) +{ /* ~10KB of stack storage */ + POINTonE2 T[MILLER_LOOP_N_MAX]; + POINTonE2_affine Q[MILLER_LOOP_N_MAX]; + POINTonE1_affine Px2[MILLER_LOOP_N_MAX]; + const POINTonE2_affine *Qptr = NULL; + const POINTonE1_affine *Pptr = NULL; + size_t i, j; + + for (i = 0, j = 0; j < n; j++) { + Qptr = *Qs ? *Qs++ : Qptr+1; + Pptr = *Ps ? *Ps++ : Pptr+1; + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2[i].X, Pptr->X, Pptr->X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, Pptr->Y, Pptr->Y); + + vec_copy(Q[i].X, Qptr->X, 2*sizeof(Q[i].X)); + vec_copy(T[i].X, Qptr->X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + + if (++i == MILLER_LOOP_N_MAX || j == n-1) { + vec384fp12 tmp; + vec384fp6 *ret = j < MILLER_LOOP_N_MAX ? out : tmp; + + /* first step is ret = 1^2*line, which is just ret = line */ + start_dbl_n(ret, T, Px2, i); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, i, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, i, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, i, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, i, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, i, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ + + if (j >= MILLER_LOOP_N_MAX) + mul_fp12(out, out, ret); + + i = 0; + } + } +} + void blst_final_exp(vec384fp12 ret, const vec384fp12 f) { final_exp(ret, f); } diff --git a/crypto/blst_src/pentaroot.c b/crypto/blst_src/pentaroot.c index fd028113f3d..71f334df50a 100644 --- a/crypto/blst_src/pentaroot.c +++ b/crypto/blst_src/pentaroot.c @@ -6,10 +6,10 @@ #include "fields.h" -static inline void mul_fr(vec384 ret, const vec384 a, const vec384 b) +static inline void mul_fr(vec256 ret, const vec256 a, const vec256 b) { mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } -static inline void sqr_fr(vec384 ret, const vec384 a) +static inline void sqr_fr(vec256 ret, const vec256 a) { sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } #ifdef __OPTIMIZE_SIZE__ diff --git a/crypto/blst_src/vect.h b/crypto/blst_src/vect.h index 3211c8628cf..554dd5daefc 100644 --- a/crypto/blst_src/vect.h +++ b/crypto/blst_src/vect.h @@ -61,7 +61,7 @@ typedef unsigned char byte; typedef byte pow256[256/8]; /* - * Internal Boolean type, Bolean by value, hence safe to cast to or + * Internal Boolean type, Boolean by value, hence safe to cast to or * reinterpret as 'bool'. 
*/ typedef limb_t bool_t; @@ -147,7 +147,6 @@ bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); # define mul_mont_384x mulx_mont_384x # define sqr_mont_384x sqrx_mont_384x # define sqr_mont_382x sqrx_mont_382x -# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x # define mul_382x mulx_382x # define sqr_382x sqrx_382x #endif @@ -156,8 +155,6 @@ void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, const vec384 p, limb_t n0); void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); -void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count, - const vec384 p, limb_t n0, const vec384x b); void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); @@ -214,7 +211,7 @@ typedef const void *uptr_t; #endif #if defined(__GNUC__) || defined(__clang__) -# define launder(var) asm volatile("" : "+r"(var)) +# define launder(var) __asm__ __volatile__("" : "+r"(var)) #else # define launder(var) #endif @@ -249,9 +246,12 @@ static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, { limb_t ai, *ap = (limb_t *)a; limb_t bi, *bp = (limb_t *)b; - limb_t xorm, mask = (limb_t)0 - cbit; + limb_t xorm, mask; size_t i; + launder(cbit); + mask = (limb_t)0 - cbit; + num /= sizeof(limb_t); for (i = 0; i < num; i++) { @@ -377,7 +377,7 @@ static inline void vec_zero(void *ret, size_t num) rp[i] = 0; #if defined(__GNUC__) || defined(__clang__) - asm volatile("" : : "r"(ret) : "memory"); + __asm__ __volatile__("" : : "r"(ret) : "memory"); #endif } @@ -398,7 +398,7 @@ static inline void vec_zero(void *ret, size_t num) # pragma warning(disable: 4127 4189) #endif -#if !defined(__wasm__) +#if !defined(__wasm__) && __STDC_HOSTED__-0 != 0 # include #endif From 241798f34c764a3af499574986834420e58b45d6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 18:10:51 -0600 Subject: [PATCH 175/200] clean up assembly files include in cgo compilations --- crypto/blst_assembly.S | 117 +------------------------------ crypto/blst_src/README.md | 3 +- crypto/blst_src/build/assembly.S | 116 ++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 118 deletions(-) create mode 100644 crypto/blst_src/build/assembly.S diff --git a/crypto/blst_assembly.S b/crypto/blst_assembly.S index c0c5db30850..fb99b3d985e 100644 --- a/crypto/blst_assembly.S +++ b/crypto/blst_assembly.S @@ -1,116 +1 @@ -#if defined(__x86_64) || defined(__x86_64__) -# if defined(__ELF__) -# if defined(__BLST_PORTABLE__) -# include "elf/sha256-portable-x86_64.s" -# define blst_sha256_block_data_order blst_sha256_block_ssse3 -# endif -# include "elf/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/ctq_inverse_mod_384-x86_64.s" -# endif -# include "elf/add_mod_384-x86_64.s" -# include "elf/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/mulx_mont_384-x86_64.s" -# include "elf/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/mulq_mont_384-x86_64.s" -# include "elf/mulq_mont_256-x86_64.s" -# endif -# include "elf/add_mod_256-x86_64.s" -# include "elf/ct_inverse_mod_256-x86_64.s" -# include "elf/div3w-x86_64.s" -# include "elf/ct_is_square_mod_384-x86_64.s" -# elif defined(_WIN64) || 
defined(__CYGWIN__) -# include "coff/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/ctq_inverse_mod_384-x86_64.s" -# endif -# include "coff/add_mod_384-x86_64.s" -# include "coff/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/mulx_mont_384-x86_64.s" -# include "coff/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/mulq_mont_384-x86_64.s" -# include "coff/mulq_mont_256-x86_64.s" -# endif -# include "coff/add_mod_256-x86_64.s" -# include "coff/ct_inverse_mod_256-x86_64.s" -# include "coff/div3w-x86_64.s" -# include "coff/ct_is_square_mod_384-x86_64.s" -# elif defined(__APPLE__) -# include "mach-o/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/ctq_inverse_mod_384-x86_64.s" -# endif -# include "mach-o/add_mod_384-x86_64.s" -# include "mach-o/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/mulx_mont_384-x86_64.s" -# include "mach-o/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/mulq_mont_384-x86_64.s" -# include "mach-o/mulq_mont_256-x86_64.s" -# endif -# include "mach-o/add_mod_256-x86_64.s" -# include "mach-o/ct_inverse_mod_256-x86_64.s" -# include "mach-o/div3w-x86_64.s" -# include "mach-o/ct_is_square_mod_384-x86_64.s" -# endif -#elif defined(__aarch64__) -# if defined(__ELF__) -# include "elf/sha256-armv8.S" -# include "elf/ct_inverse_mod_384-armv8.S" -# include "elf/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "elf/mul_mont_384-armv8.S" -# include "elf/mul_mont_256-armv8.S" -# include "elf/add_mod_256-armv8.S" -# include "elf/ct_inverse_mod_256-armv8.S" -# include "elf/div3w-armv8.S" -# include "elf/ct_is_square_mod_384-armv8.S" -# elif defined(_WIN64) -# include "coff/sha256-armv8.S" -# include "coff/ct_inverse_mod_384-armv8.S" -# include "coff/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "coff/mul_mont_384-armv8.S" -# include "coff/mul_mont_256-armv8.S" -# include "coff/add_mod_256-armv8.S" -# include "coff/ct_inverse_mod_256-armv8.S" -# include "coff/div3w-armv8.S" -# include "coff/ct_is_square_mod_384-armv8.S" -# elif defined(__APPLE__) -# include "mach-o/sha256-armv8.S" -# include "mach-o/ct_inverse_mod_384-armv8.S" -# include "mach-o/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "mach-o/mul_mont_384-armv8.S" -# include "mach-o/mul_mont_256-armv8.S" -# include "mach-o/add_mod_256-armv8.S" -# include "mach-o/ct_inverse_mod_256-armv8.S" -# include "mach-o/div3w-armv8.S" -# include "mach-o/ct_is_square_mod_384-armv8.S" -# endif -#elif defined(__BLST_NO_ASM__) || \ - (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) -/* inaccurate way to detect a 32-bit processor, but it's close enough */ -#else -# error "unsupported platform" -#endif +# include "assembly.S" diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index ff63254bbe5..46715d13c2c 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -1,5 +1,5 @@ All files in this folder contain source files 
copied from the BLST repo https://github.com/supranational/blst, -specifically from the tagged version v0.3.11. +specifically from the tagged version `v0.3.11`. Copyright Supranational LLC Licensed under the Apache License, Version 2.0, see LICENSE for details. @@ -20,7 +20,6 @@ To upgrade the BLST version: - [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. - [ ] delete `./blst_src/server.c`. - [ ] copy the folder `/build/` into this folder `./blst_src`. -- [ ] move `./blst_src/build/assembly.S` to `./blst_assembly.S`. - [ ] update `./blst_src/blst_src.c` if needed. - [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. - [ ] solve all breaking changes that may occur. diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_src/build/assembly.S new file mode 100644 index 00000000000..c0c5db30850 --- /dev/null +++ b/crypto/blst_src/build/assembly.S @@ -0,0 +1,116 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# if defined(__BLST_PORTABLE__) +# include "elf/sha256-portable-x86_64.s" +# define blst_sha256_block_data_order blst_sha256_block_ssse3 +# endif +# include "elf/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/ctq_inverse_mod_384-x86_64.s" +# endif +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# include "elf/ct_inverse_mod_256-x86_64.s" +# include "elf/div3w-x86_64.s" +# include "elf/ct_is_square_mod_384-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# include "coff/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/ctq_inverse_mod_384-x86_64.s" +# endif +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# include "coff/ct_inverse_mod_256-x86_64.s" +# include "coff/div3w-x86_64.s" +# include "coff/ct_is_square_mod_384-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/ctq_inverse_mod_384-x86_64.s" +# endif +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# include "mach-o/ct_inverse_mod_256-x86_64.s" +# include 
"mach-o/div3w-x86_64.s" +# include "mach-o/ct_is_square_mod_384-x86_64.s" +# endif +#elif defined(__aarch64__) +# if defined(__ELF__) +# include "elf/sha256-armv8.S" +# include "elf/ct_inverse_mod_384-armv8.S" +# include "elf/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "elf/mul_mont_384-armv8.S" +# include "elf/mul_mont_256-armv8.S" +# include "elf/add_mod_256-armv8.S" +# include "elf/ct_inverse_mod_256-armv8.S" +# include "elf/div3w-armv8.S" +# include "elf/ct_is_square_mod_384-armv8.S" +# elif defined(_WIN64) +# include "coff/sha256-armv8.S" +# include "coff/ct_inverse_mod_384-armv8.S" +# include "coff/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "coff/mul_mont_384-armv8.S" +# include "coff/mul_mont_256-armv8.S" +# include "coff/add_mod_256-armv8.S" +# include "coff/ct_inverse_mod_256-armv8.S" +# include "coff/div3w-armv8.S" +# include "coff/ct_is_square_mod_384-armv8.S" +# elif defined(__APPLE__) +# include "mach-o/sha256-armv8.S" +# include "mach-o/ct_inverse_mod_384-armv8.S" +# include "mach-o/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "mach-o/mul_mont_384-armv8.S" +# include "mach-o/mul_mont_256-armv8.S" +# include "mach-o/add_mod_256-armv8.S" +# include "mach-o/ct_inverse_mod_256-armv8.S" +# include "mach-o/div3w-armv8.S" +# include "mach-o/ct_is_square_mod_384-armv8.S" +# endif +#elif defined(__BLST_NO_ASM__) || \ + (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) +/* inaccurate way to detect a 32-bit processor, but it's close enough */ +#else +# error "unsupported platform" +#endif From 3bea523639703ae60e3d02d689444d66e0152517 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 18:29:49 -0600 Subject: [PATCH 176/200] update internal/blst files --- crypto/blst_src/README.md | 9 ++- crypto/internal/blst/blst.go | 113 ++++++++++++++++++++++++++------ crypto/internal/blst/blst.h | 9 ++- crypto/internal/blst/blst_aux.h | 8 ++- 4 files changed, 111 insertions(+), 28 deletions(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 46715d13c2c..5f70311c6fd 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -15,14 +15,17 @@ The folder contains: - this `README` file. To upgrade the BLST version: +- [ ] audit all BLST updated, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... - [ ] delete all files in this folder `./blst_src/` but `blst_src.c` and `README.md`. +- [ ] delete all files in `./internal/blst/`. - [ ] open BLST repository on the new version. - [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. -- [ ] delete `./blst_src/server.c`. +- [ ] delete newly copied `./blst_src/server.c`. - [ ] copy the folder `/build/` into this folder `./blst_src`. -- [ ] update `./blst_src/blst_src.c` if needed. +- [ ] copy `/bindings/blst.h`, `/bindings/blst_aux.h`, and `/bindings/go/blst.go` into `./internal/blst/.`. - [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. +- [ ] update `./blst_src/blst_src.c` if needed. - [ ] solve all breaking changes that may occur. - [ ] update the commit version on this `./blst_src/README`. -Remember that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should made along with auditing changes between the old and new versions. 
This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. \ No newline at end of file +Note that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should be done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go index 037e40d98a3..c890f55e367 100644 --- a/crypto/internal/blst/blst.go +++ b/crypto/internal/blst/blst.go @@ -1,16 +1,8 @@ +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// DO NOT EDIT THIS FILE!! +// The file is generated from *.tgo by generate.py +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! /* - * This package is equivalent to the BLST Go package including all Go exported - * functions. BLST outer Go layer is used to cross-check flow-go/crypto BLS implementation. - * Note that flow-go/crypto uses BLST internal tools only to implement protocols based on BLS12-381, - * but does not use BLST outer layer and BLS implementation. - * Ideally, the cross-check tests would import github.com/supranational/blst. However this is - * not possible in Go as it causes multiple duplicated C objects. Creating the internal blst - * package is a workaround to achieve the same purpose. Note that the internal package - * implicitly uses the C objects declared by flow-go/crypto. - * - * Note: linter staticcheck was added in two spots to avoid linter false positives. - * - * Copied from https://github.com/supranational/blst. * Copyright Supranational LLC * Licensed under the Apache License, Version 2.0, see LICENSE for details. * SPDX-License-Identifier: Apache-2.0 @@ -18,7 +10,7 @@ package blst -// #cgo CFLAGS: -I${SRCDIR} -I${SRCDIR}/../../blst_src/build -I${SRCDIR}/../../blst_src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo CFLAGS: -I${SRCDIR}/..
-I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset // #cgo amd64 CFLAGS: -D__ADX__ -mno-avx // #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "blst.h" @@ -132,6 +124,25 @@ package blst // blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); // return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); // } +// static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], +// const blst_p1_affine P[], +// size_t npoints, bool acc) +// { const blst_p2_affine *Qs[2] = { Q, NULL }; +// const blst_p1_affine *Ps[2] = { P, NULL }; +// if (acc) { +// blst_fp12 tmp; +// blst_miller_loop_n(&tmp, Qs, Ps, npoints); +// blst_fp12_mul(dst, dst, &tmp); +// } else { +// blst_miller_loop_n(dst, Qs, Ps, npoints); +// } +// } +// static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) +// { size_t i; +// blst_fp12_mul(dst, &in[0], &in[1]); +// for (i = 2; i < n; i++) +// blst_fp12_mul(dst, dst, &in[i]); +// } import "C" import ( "fmt" @@ -358,6 +369,64 @@ func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { return &pt } +func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { + if len(qs) != len(ps) || len(qs) == 0 { + panic("inputs' lengths mismatch") + } + + nElems := uint32(len(qs)) + nThreads := uint32(maxProcs) + + if nThreads == 1 || nElems == 1 { + var pt Fp12 + C.go_miller_loop_n(&pt, &qs[0], &ps[0], C.size_t(nElems), false) + return &pt + } + + stride := (nElems + nThreads - 1) / nThreads + if stride > 16 { + stride = 16 + } + + strides := (nElems + stride - 1) / stride + if nThreads > strides { + nThreads = strides + } + + msgsCh := make(chan Fp12, nThreads) + curElem := uint32(0) + + for tid := uint32(0); tid < nThreads; tid++ { + go func() { + acc := Fp12One() + first := true + for { + work := atomic.AddUint32(&curElem, stride) - stride + if work >= nElems { + break + } + n := nElems - work + if n > stride { + n = stride + } + C.go_miller_loop_n(&acc, &qs[work], &ps[work], C.size_t(n), + C.bool(!first)) + first = false + } + msgsCh <- acc + }() + } + + var ret = make([]Fp12, nThreads) + for i := range ret { + ret[i] = <-msgsCh + } + + var pt Fp12 + C.go_fp12slice_mul(&pt, &ret[0], C.size_t(nThreads)) + return &pt +} + func (pt *Fp12) MulAssign(p *Fp12) { C.blst_fp12_mul(pt, pt, p) } @@ -376,6 +445,10 @@ func (pt *Fp12) ToBendian() []byte { return out[:] } +func (pt1 *Fp12) Equals(pt2 *Fp12) bool { + return *pt1 == *pt2 +} + // // MIN-PK // @@ -399,8 +472,10 @@ func (pk *P1Affine) KeyValidate() bool { // always cryptographically safe, but application might want // to guard against obviously bogus individual[!] signatures. func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { - return (sigInfcheck && !bool(C.blst_p2_affine_is_inf(sig))) || - bool(C.blst_p2_affine_in_g2(sig)) + if sigInfcheck && bool(C.blst_p2_affine_is_inf(sig)) { + return false + } + return bool(C.blst_p2_affine_in_g2(sig)) } // @@ -589,7 +664,6 @@ func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() - //nolint:staticcheck mutex.Unlock() } @@ -1018,8 +1092,10 @@ func (pk *P2Affine) KeyValidate() bool { // always cryptographically safe, but application might want // to guard against obviously bogus individual[!] signatures. 
func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { - return (sigInfcheck && !bool(C.blst_p1_affine_is_inf(sig))) || - bool(C.blst_p1_affine_in_g1(sig)) + if sigInfcheck && bool(C.blst_p1_affine_is_inf(sig)) { + return false + } + return bool(C.blst_p1_affine_in_g1(sig)) } // @@ -1208,7 +1284,6 @@ func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, // main thread has completed its miller loop before // proceeding. mutex.Lock() - //nolint:staticcheck mutex.Unlock() } diff --git a/crypto/internal/blst/blst.h b/crypto/internal/blst/blst.h index 2e314b3a32e..1349896a3f8 100644 --- a/crypto/internal/blst/blst.h +++ b/crypto/internal/blst/blst.h @@ -95,10 +95,6 @@ void blst_fr_sqr(blst_fr *ret, const blst_fr *a); void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); void blst_fr_inverse(blst_fr *ret, const blst_fr *a); -#ifdef BLST_FR_PENTAROOT -void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); -void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); -#endif void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); @@ -341,6 +337,9 @@ void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, #ifndef SWIG void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, const blst_p1_affine *P); +void blst_miller_loop_n(blst_fp12 *ret, const blst_p2_affine *const Qs[], + const blst_p1_affine *const Ps[], + size_t n); void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], @@ -480,4 +479,4 @@ extern const blst_p2_affine BLS12_381_NEG_G2; #ifdef __cplusplus } #endif -#endif \ No newline at end of file +#endif diff --git a/crypto/internal/blst/blst_aux.h b/crypto/internal/blst/blst_aux.h index d96b1f3dd3b..3de0850e330 100644 --- a/crypto/internal/blst/blst_aux.h +++ b/crypto/internal/blst/blst_aux.h @@ -10,8 +10,14 @@ * depending on their proven/unproven worthiness. */ +void blst_fr_ct_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); +void blst_fr_gs_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); void blst_fr_to(blst_fr *ret, const blst_fr *a); void blst_fr_from(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif void blst_fp_to(blst_fp *ret, const blst_fp *a); void blst_fp_from(blst_fp *ret, const blst_fp *a); @@ -108,4 +114,4 @@ size_t blst_fp12_sizeof(void); * Single-shot SHA-256 hash function. */ void blst_sha256(byte out[32], const byte *msg, size_t msg_len); -#endif \ No newline at end of file +#endif From c292bc44babd8850e79c0cc7291eba4fc0e93fa6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 7 Sep 2023 18:33:10 -0600 Subject: [PATCH 177/200] fix a readme typo --- crypto/blst_src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 5f70311c6fd..50ca45ea7d6 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -15,7 +15,7 @@ The folder contains: - this `README` file. To upgrade the BLST version: -- [ ] audit all BLST updated, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... +- [ ] audit all BLST updates, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... 
- [ ] delete all files in this folder `./blst_src/` but `blst_src.c` and `README.md`. - [ ] delete all files in `./internal/blst/`. - [ ] open BLST repository on the new version. From 0d09d5517e03417e6fee79e11f5a32ae8e9ea892 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 8 Sep 2023 19:22:50 -0600 Subject: [PATCH 178/200] tmp tmate to debug --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b24de2f44ca..db68ffe199c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,6 +111,8 @@ jobs: cache: true - name: Setup tests (${{ matrix.targets.name }}) run: VERBOSE=1 make -e GO_TEST_PACKAGES="${{ matrix.targets.packages }}" install-tools + - name: Setup tmate session + uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) uses: nick-fields/retry@v2 with: From dce50f91c09f81efe78776269c0ee99355c95b55 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 12 Sep 2023 13:49:58 -0600 Subject: [PATCH 179/200] make start up node time larger to accommodate failing TestClusterSwitchover_MultiCluster --- .github/workflows/ci.yml | 2 -- engine/collection/test/cluster_switchover_test.go | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db68ffe199c..b24de2f44ca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,8 +111,6 @@ jobs: cache: true - name: Setup tests (${{ matrix.targets.name }}) run: VERBOSE=1 make -e GO_TEST_PACKAGES="${{ matrix.targets.packages }}" install-tools - - name: Setup tmate session - uses: mxschmitt/action-tmate@v3 - name: Run tests (${{ matrix.targets.name }}) uses: nick-fields/retry@v2 with: diff --git a/engine/collection/test/cluster_switchover_test.go b/engine/collection/test/cluster_switchover_test.go index a8f04173099..15a23823ab3 100644 --- a/engine/collection/test/cluster_switchover_test.go +++ b/engine/collection/test/cluster_switchover_test.go @@ -212,7 +212,7 @@ func (tc *ClusterSwitchoverTestCase) StartNodes() { nodes = append(nodes, node) } - unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), time.Second, "could not start nodes") + unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), 3*time.Second, "could not start nodes") // start continuous delivery for all nodes for _, node := range tc.nodes { From ba78ef6dc9d9752bf58ddeba450b1a511edb32f9 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 19:50:32 -0600 Subject: [PATCH 180/200] makefile typo --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b471e25ee09..874d56f8d72 100644 --- a/Makefile +++ b/Makefile @@ -222,7 +222,7 @@ generate-mocks: install-mock-generators tidy: go mod tidy -v cd integration; go mod tidy -v - cd crypo; go mod tidy -v + cd crypto; go mod tidy -v cd cmd/testclient; go mod tidy -v cd insecure; go mod tidy -v git diff --exit-code From 8ff9b79d805f64914d39d6af71d300e9c6098ebe Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 19:50:59 -0600 Subject: [PATCH 181/200] clean up C bls12_381 utils --- crypto/bls12381_utils.c | 287 +++++++++++++++++++++------------------- crypto/blst_include.h | 12 +- 2 files changed, 154 insertions(+), 145 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index e4636aad457..9f168e0b3e0 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -11,15 +11,18 @@ // make sure flow 
crypto types are consistent with BLST types void types_sanity(void) { + assert(sizeof(vec256) == sizeof(Fr)); assert(sizeof(Fp) == sizeof(vec384)); + assert(sizeof(vec384x) == sizeof(Fp2)); assert(sizeof(E1) == sizeof(POINTonE1)); assert(sizeof(E2) == sizeof(POINTonE2)); + assert(sizeof(vec384fp12) == sizeof(Fp12)); } - + // ------------------- Fr utilities // Montgomery constant R related to the curve order r -// R mod r = (1<<256)%r +// R = (1<<256) mod r const Fr BLS12_381_rR = {{ TO_LIMB_T(0x1824b159acc5056f), TO_LIMB_T(0x998c4fefecbc4ff5), @@ -27,7 +30,7 @@ const Fr BLS12_381_rR = {{ TO_LIMB_T(0x00000001fffffffe), }}; -// returns true if a == 0 and false otherwise +// returns true if a is zero and false otherwise bool Fr_is_zero(const Fr *a) { return vec_is_zero(a, sizeof(Fr)); } // returns true if a == b and false otherwise @@ -75,8 +78,8 @@ void Fr_squ_montg(Fr *res, const Fr *a) { // res = a*R void Fr_to_montg(Fr *res, const Fr *a) { - mul_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_rRR, BLS12_381_r, - r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, + BLS12_381_rRR, BLS12_381_r, r0); } // res = a*R^(-1) @@ -101,9 +104,9 @@ void Fr_inv_montg_eucl(Fr *res, const Fr *a) { } // computes the sum of the array elements and writes the sum in jointx -void Fr_sum_vector(Fr *jointx, const Fr x[], const int len) { +void Fr_sum_vector(Fr *jointx, const Fr x[], const int x_len) { Fr_set_zero(jointx); - for (int i = 0; i < len; i++) { + for (int i = 0; i < x_len; i++) { Fr_add(jointx, jointx, &x[i]); } } @@ -118,10 +121,10 @@ static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) { *(ret++) = *b; *(b--) = tmp; } - return; - } - for (int i = 0; i < Fr_BYTES; i++) { - *(ret++) = *(b--); + } else { + for (int i = 0; i < Fr_BYTES; i++) { + *(ret++) = *(b--); + } } } @@ -136,19 +139,19 @@ static void pow256_from_Fr(pow256 ret, const Fr *in) { // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fr // - VALID if the scalar is valid -ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { - if (len != Fr_BYTES) { +ERROR Fr_read_bytes(Fr *a, const byte *in, int in_len) { + if (in_len != Fr_BYTES) { return BAD_ENCODING; } - // compare to r using the BLST tool + // compare to r using BLST internal function pow256 tmp; - pow256_from_be_bytes(tmp, bin); + pow256_from_be_bytes(tmp, in); // (check_mod_256 compares pow256 against a vec256!) if (!check_mod_256(tmp, BLS12_381_r)) { return BAD_VALUE; } vec_zero(tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)a, bin, Fr_BYTES); + limbs_from_be_bytes((limb_t *)a, in, Fr_BYTES); return VALID; } @@ -158,8 +161,8 @@ ERROR Fr_read_bytes(Fr *a, const byte *bin, int len) { // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fr_star // - VALID if the scalar is valid -ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len) { - int ret = Fr_read_bytes(a, bin, len); +ERROR Fr_star_read_bytes(Fr *a, const byte *in, int in_len) { + int ret = Fr_read_bytes(a, in, in_len); if (ret != VALID) { return ret; } @@ -171,9 +174,9 @@ ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len) { } // write Fr element `a` in big endian bytes. -void Fr_write_bytes(byte *bin, const Fr *a) { +void Fr_write_bytes(byte *out, const Fr *a) { // be_bytes_from_limbs works for both limb endianness types - be_bytes_from_limbs(bin, (limb_t *)a, Fr_BYTES); + be_bytes_from_limbs(out, (limb_t *)a, Fr_BYTES); } // maps big-endian bytes of any size into an Fr element using modular reduction. 
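Note: Fr_from_be_bytes / map_bytes_to_Fr below reduce an arbitrary-length big-endian integer modulo the BLS12-381 group order r; the C code works chunk by chunk with Montgomery multiplications, but the end result matches this big-integer reference sketch (illustrative input only):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// BLS12-381 subgroup order r
	r, _ := new(big.Int).SetString(
		"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

	in := make([]byte, 48) // any input length is accepted by the reduction
	in[47] = 7             // example input: the integer 7

	reduced := new(big.Int).Mod(new(big.Int).SetBytes(in), r)
	fmt.Println("input mod r =", reduced) // 7, already smaller than r
}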
@@ -181,7 +184,7 @@ void Fr_write_bytes(byte *bin, const Fr *a) { // // Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, // limb_t n0) to reduce 512 bits at a time. -static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { +static void Fr_from_be_bytes(Fr *out, const byte *in, const int in_len) { // input can be written in base 2^|R|, with R the Montgomery constant // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) // Therefore N mod p can be expressed using R as: @@ -190,7 +193,8 @@ static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { Fr_set_zero(out); Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 - byte *p = (byte *)bytes + n; + int n = in_len; + byte *p = (byte *)in + in_len; while (n > Fr_BYTES) { // limbs_from_be_bytes works for both limb endiannesses limbs_from_be_bytes((limb_t *)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i @@ -214,8 +218,8 @@ static void Fr_from_be_bytes(Fr *out, const byte *bytes, size_t n) { // Reads a scalar from an array and maps it to Fr using modular reduction. // Input is byte-big-endian as used by the external APIs. // It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr *a, const byte *bin, int len) { - Fr_from_be_bytes(a, bin, len); +bool map_bytes_to_Fr(Fr *a, const byte *in, int in_len) { + Fr_from_be_bytes(a, in, in_len); return Fr_is_zero(a); } @@ -262,15 +266,15 @@ static bool Fp_sqrt_montg(Fp *res, const Fp *a) { return sqrt_fp((limb_t *)res, (limb_t *)a); } -static bool Fp_check(const Fp *in) { +static bool Fp_check(const Fp *a) { // use same method as in BLST internal function // which seems the most efficient. The method uses the assembly-based // modular addition instead of limbs comparison Fp temp; - Fp_add(&temp, in, &ZERO_384); - return vec_is_equal(&temp, in, Fp_BYTES); - // no need to clear `tmp` as no use-case involves sensitive data being passed - // as `in` + Fp_add(&temp, a, &ZERO_384); + return vec_is_equal(&temp, a, Fp_BYTES); + // no need to clear `tmp` as no current use-case involves sensitive data being passed + // as `a` } // res = a*b*R^(-1) @@ -293,36 +297,36 @@ void Fp_from_montg(Fp *res, const Fp *a) { from_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); } -// reads a scalar in `a` and checks it is a valid Fp element (a < p). +// reads a scalar in `out` and checks it is a valid Fp element (out < p). // input is bytes-big-endian. // returns: // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp // - VALID if the scalar is valid -ERROR Fp_read_bytes(Fp *a, const byte *bin, int len) { - if (len != Fp_BYTES) { +ERROR Fp_read_bytes(Fp *out, const byte *in, int in_len) { + if (in_len != Fp_BYTES) { return BAD_ENCODING; } - limbs_from_be_bytes((limb_t *)a, bin, Fp_BYTES); + limbs_from_be_bytes((limb_t *)out, in, Fp_BYTES); // compare read scalar to p - if (!Fp_check(a)) { + if (!Fp_check(out)) { return BAD_VALUE; } return VALID; } -// write Fp element to `bin`, -// assuming `bin` has `Fp_BYTES` allocated bytes. -void Fp_write_bytes(byte *bin, const Fp *a) { - be_bytes_from_limbs(bin, (limb_t *)a, Fp_BYTES); +// write Fp element to `out`, +// assuming `out` has `Fp_BYTES` allocated bytes. +void Fp_write_bytes(byte *out, const Fp *a) { + be_bytes_from_limbs(out, (limb_t *)a, Fp_BYTES); } -// returns the sign of y. +// returns the sign of y: // 1 if y > (p - 1)/2 and 0 otherwise. -// y is in montgomery form +// y is in montgomery form! 
static byte Fp_get_sign(const Fp *y) { - // BLST's sgn0_pty_mont_384 requires input to be in Montg form. - // The needed sign bit is on position 1 ! + // - BLST's sgn0_pty_mont_384 requires input to be in Montg form. + // - The needed sign bit is on position 1 return (sgn0_pty_mont_384((const limb_t *)y, BLS12_381_P, p0) >> 1) & 1; } @@ -361,18 +365,19 @@ static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { // the square root in `res`. // // The boolean output is valid whether `a` is in Montgomery form or not, -// since montgomery constant `R` is a quadratic residue. -// However, the square root is valid only if `a` is in montgomery form. +// since montgomery constant `R` is itself a quadratic residue. +// However, the square root is correct only if `a` is in montgomery form +// (the square root would be in montgomery form too). static bool Fp2_sqrt_montg(Fp2 *res, const Fp2 *a) { return sqrt_fp2((vec384 *)res, (vec384 *)a); } -// returns the sign of y. -// sign(y_0) if y_1 = 0, else sign(y_1) -// y coordinates must be in montgomery form +// returns the sign of y: +// sign(y_0) if y_1 = 0, else sign(y_1). +// y coordinates must be in montgomery form! static byte Fp2_get_sign(Fp2 *y) { - // BLST's sgn0_pty_mont_384x requires input to be in Montg form. - // The needed sign bit is on position 1 ! + // - BLST's sgn0_pty_mont_384x requires input to be in montgomery form. + // - the sign bit is on position 1 return (sgn0_pty_mont_384x((vec384 *)y, BLS12_381_P, p0) >> 1) & 1; } @@ -383,15 +388,15 @@ static byte Fp2_get_sign(Fp2 *y) { // - BAD_ENCODING if the length is invalid // - BAD_VALUE if the scalar isn't in Fp // - VALID if the scalar is valid -static ERROR Fp2_read_bytes(Fp2 *a, const byte *bin, int len) { - if (len != Fp2_BYTES) { +static ERROR Fp2_read_bytes(Fp2 *a, const byte *in, int in_len) { + if (in_len != Fp2_BYTES) { return BAD_ENCODING; } - ERROR ret = Fp_read_bytes(&real(a), bin, Fp_BYTES); + ERROR ret = Fp_read_bytes(&real(a), in, Fp_BYTES); if (ret != VALID) { return ret; } - ret = Fp_read_bytes(&imag(a), bin + Fp_BYTES, Fp_BYTES); + ret = Fp_read_bytes(&imag(a), in + Fp_BYTES, Fp_BYTES); if (ret != VALID) { return ret; } @@ -399,9 +404,9 @@ static ERROR Fp2_read_bytes(Fp2 *a, const byte *bin, int len) { } // write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. -void Fp2_write_bytes(byte *bin, const Fp2 *a) { - Fp_write_bytes(bin, &real(a)); - Fp_write_bytes(bin + Fp_BYTES, &imag(a)); +void Fp2_write_bytes(byte *out, const Fp2 *a) { + Fp_write_bytes(out, &real(a)); + Fp_write_bytes(out + Fp_BYTES, &imag(a)); } // ------------------- E1 utilities @@ -419,13 +424,13 @@ bool E1_is_equal(const E1 *p1, const E1 *p2) { return POINTonE1_is_equal((const POINTonE1 *)p1, (const POINTonE1 *)p2); } -// compare p to infinity +// compare `p` to infinity bool E1_is_infty(const E1 *p) { // BLST infinity points are defined by Z=0 return vec_is_zero(p->z, sizeof(p->z)); } -// set p to infinity +// set `p` to infinity void E1_set_infty(E1 *p) { // BLST infinity points are defined by Z=0 vec_zero(p->z, sizeof(p->z)); @@ -444,7 +449,7 @@ void E1_to_affine(E1 *res, const E1 *p) { // checks affine point `p` is in E1 bool E1_affine_on_curve(const E1 *p) { - // BLST's `POINTonE1_affine_on_curve` does not include the inifity case! + // BLST's `POINTonE1_affine_on_curve` does not include the infinity case! return POINTonE1_affine_on_curve((POINTonE1_affine *)p) | E1_is_infty(p); } @@ -452,6 +457,7 @@ bool E1_affine_on_curve(const E1 *p) { // It assumes input `p` is on E1. 
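Note: E1_read_bytes below (and E2_read_bytes further down) parse the ZCash-style header of a serialized point: in the first byte, bit 7 is the compression flag, bit 6 the infinity flag and bit 5 the sign of y. A tiny sketch of that decoding, on an illustrative header byte:

package main

import "fmt"

func main() {
	header := byte(0xC0) // example: compressed encoding of the point at infinity

	compressed := (header>>7)&1 == 1
	infinity := (header>>6)&1 == 1
	ySign := (header >> 5) & 1

	fmt.Println("compressed:", compressed) // true
	fmt.Println("infinity:  ", infinity)   // true
	fmt.Println("y sign bit:", ySign)      // 0, must be 0 for an infinity encoding
}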
bool E1_in_G1(const E1 *p) { // currently uses Scott method + // TODO: compare to clearing the cofactor using u-1 return POINTonE1_in_G1((const POINTonE1 *)p); } @@ -469,27 +475,27 @@ bool E1_in_G1(const E1 *p) { // Note: could use POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, // but needs to update the logic around G2 subgroup check -ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { +ERROR E1_read_bytes(E1 *a, const byte *in, const int in_len) { // check the length - if (len != G1_SER_BYTES) { + if (in_len != G1_SER_BYTES) { return BAD_ENCODING; } // check the compression bit - int compressed = bin[0] >> 7; + int compressed = in[0] >> 7; if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { return BAD_ENCODING; } // check if the point in infinity - int is_infinity = bin[0] & 0x40; + int is_infinity = in[0] & 0x40; if (is_infinity) { // the remaining bits need to be cleared - if (bin[0] & 0x3F) { + if (in[0] & 0x3F) { return BAD_ENCODING; } for (int i = 1; i < G1_SER_BYTES - 1; i++) { - if (bin[i]) { + if (in[i]) { return BAD_ENCODING; } } @@ -498,14 +504,14 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { } // read the sign bit and check for consistency - int y_sign = (bin[0] >> 5) & 1; + int y_sign = (in[0] >> 5) & 1; if (y_sign && (!compressed)) { return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp_BYTES]; - memcpy(temp, bin, Fp_BYTES); + memcpy(temp, in, Fp_BYTES); temp[0] &= 0x1F; // clear the header bits ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); if (ret != VALID) { @@ -517,7 +523,7 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { Fp_copy(&a->z, &BLS12_381_pR); if (G1_SERIALIZATION == UNCOMPRESSED) { - ret = Fp_read_bytes(&a->y, bin + Fp_BYTES, sizeof(a->y)); + ret = Fp_read_bytes(&a->y, in + Fp_BYTES, sizeof(a->y)); if (ret != VALID) { return ret; } @@ -532,13 +538,13 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // compute the possible square root Fp_squ_montg(&a->y, &a->x); Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 - Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in Montg form + Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in montg form // check whether x^3+b is a quadratic residue if (!Fp_sqrt_montg(&a->y, &a->y)) { return POINT_NOT_ON_CURVE; } - // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) + // resulting (x,y) is guaranteed to be on curve (y is already in montg form) if (Fp_get_sign(&a->y) != y_sign) { Fp_neg(&a->y, &a->y); // flip y sign if needed } @@ -549,27 +555,27 @@ ERROR E1_read_bytes(E1 *a, const byte *bin, const int len) { // uncompressed form. 
It assumes buffer is of length G1_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E1_write_bytes(byte *bin, const E1 *a) { +void E1_write_bytes(byte *out, const E1 *a) { if (E1_is_infty(a)) { // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | (1 << 6); - memset(bin + 1, 0, G1_SER_BYTES - 1); + out[0] = (G1_SERIALIZATION << 7) | (1 << 6); + memset(out + 1, 0, G1_SER_BYTES - 1); return; } E1 tmp; E1_to_affine(&tmp, a); Fp_from_montg(&tmp.x, &tmp.x); - Fp_write_bytes(bin, &tmp.x); + Fp_write_bytes(out, &tmp.x); if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp_get_sign(&tmp.y) << 5); + out[0] |= (Fp_get_sign(&tmp.y) << 5); } else { Fp_from_montg(&tmp.y, &tmp.y); - Fp_write_bytes(bin + Fp_BYTES, &tmp.y); + Fp_write_bytes(out + Fp_BYTES, &tmp.y); } // compression bit - bin[0] |= (G1_SERIALIZATION << 7); + out[0] |= (G1_SERIALIZATION << 7); } // generic point addition that must handle doubling and points at infinity @@ -599,28 +605,29 @@ void E1_sum_vector(E1 *sum, const E1 *y, const int len) { } } -// Computes the sum of input signatures (E1 elements) flattened in a single byte -// array `sigs_bytes` of `sigs_len` bytes. and writes the sum (E1 element) as -// bytes in `dest`. The function does not check membership of E1 inputs in G1 +// Computes the sum of input E1 elements flattened in a single byte +// array `in_bytes` of `in_len` bytes. and writes the sum (E1 element) as +// bytes in `out`. +// The function does not check membership of E1 inputs in G1 // subgroup. The header is using byte pointers to minimize Cgo calls from the Go // layer. -int E1_sum_vector_byte(byte *dest, const byte *sigs_bytes, const int sigs_len) { +int E1_sum_vector_byte(byte *out, const byte *in_bytes, const int in_len) { int error = UNDEFINED; // sanity check that `len` is multiple of `G1_SER_BYTES` - if (sigs_len % G1_SER_BYTES) { + if (in_len % G1_SER_BYTES) { error = INVALID; goto mem_error; } - int n = sigs_len / G1_SER_BYTES; // number of signatures + int n = in_len / G1_SER_BYTES; // number of signatures - E1 *sigs = (E1 *)malloc(n * sizeof(E1)); - if (!sigs) + E1 *vec = (E1 *)malloc(n * sizeof(E1)); + if (!vec) goto mem_error; // import the points from the array for (int i = 0; i < n; i++) { // deserialize each point from the input array - if (E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES * i], G1_SER_BYTES) != + if (E1_read_bytes(&vec[i], &in_bytes[G1_SER_BYTES * i], G1_SER_BYTES) != VALID) { error = INVALID; goto out; @@ -628,12 +635,12 @@ int E1_sum_vector_byte(byte *dest, const byte *sigs_bytes, const int sigs_len) { } // sum the points E1 acc; - E1_sum_vector(&acc, sigs, n); + E1_sum_vector(&acc, vec, n); // export the result - E1_write_bytes(dest, &acc); + E1_write_bytes(out, &acc); error = VALID; out: - free(sigs); + free(vec); mem_error: return error; } @@ -648,13 +655,13 @@ void G1_mult_gen(E1 *res, const Fr *expo) { // Reads a scalar bytes and maps it to Fp using modular reduction. // output is in Montgomery form. -// `len` must be less or equal to 96 bytes and must be a multiple of 8. +// `in_len` must be less or equal to 96 bytes and must be a multiple of 8. // This function is only used by `map_to_G1` where input is 64 bytes. -// input `len` is not checked to satisfy the conditions above. -static void map_96_bytes_to_Fp(Fp *a, const byte *bin, int len) { +// input `in_len` is not checked to satisfy the conditions above. 
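For context on the mapping below: map_to_G1 follows construction 2 of the paper referenced in its comment, i.e. the 128-byte input is split into two 64-byte halves, each half is reduced into a field element u_0 resp. u_1, and the output is clear_cofactor(map_to_curve(u_0) + map_to_curve(u_1)), where map_to_curve is the simplified SWU map evaluated through an isogenous curve.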
+static void map_96_bytes_to_Fp(Fp *a, const byte *in, int in_len) { vec768 tmp; vec_zero(&tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)tmp, bin, len); + limbs_from_be_bytes((limb_t *)tmp, in, in_len); redc_mont_384((limb_t *)a, tmp, BLS12_381_P, p0); // aR^(-2) Fp_mul_montg(a, a, (Fp *)BLS12_381_RRRR); // aR } @@ -662,16 +669,16 @@ static void map_96_bytes_to_Fp(Fp *a, const byte *bin, int len) { // maps bytes input `hash` to G1. // `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) // It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -int map_to_G1(E1 *h, const byte *hash, const int len) { +int map_to_G1(E1 *h, const byte *hash, const int hash_len) { // sanity check of length - if (len != MAP_TO_G1_INPUT_LEN) { + if (hash_len != MAP_TO_G1_INPUT_LEN) { return INVALID; } // map to field elements Fp u[2]; - map_96_bytes_to_Fp(&u[0], hash, MAP_TO_G1_INPUT_LEN / 2); - map_96_bytes_to_Fp(&u[1], hash + MAP_TO_G1_INPUT_LEN / 2, - MAP_TO_G1_INPUT_LEN / 2); + const int half = MAP_TO_G1_INPUT_LEN / 2; + map_96_bytes_to_Fp(&u[0], hash, half); + map_96_bytes_to_Fp(&u[1], hash + half, half); // map field elements to G1 // inputs must be in Montgomery form map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); @@ -692,11 +699,11 @@ void unsafe_map_bytes_to_G1(E1 *p, const byte *bytes, int len) { // maps bytes to a point in E1\G1. // `len` must be at least 96 bytes. -// this is a testing file only, should not be used in any protocol! -void unsafe_map_bytes_to_G1complement(E1 *p, const byte *bytes, int len) { - assert(len >= 96); +// this is a testing function only, should not be used in any protocol! +void unsafe_map_bytes_to_G1complement(E1 *p, const byte *in, int in_len) { + assert(in_len >= 96); Fp u; - map_96_bytes_to_Fp(&u, bytes, 96); + map_96_bytes_to_Fp(&u, in, 96); // map to E1's isogenous and then to E1 map_to_isogenous_E1((POINTonE1 *)p, u); isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); @@ -724,27 +731,27 @@ const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; // // Note: can use with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, // and update the logic around G2 subgroup check. 
-ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { +ERROR E2_read_bytes(E2 *a, const byte *in, const int in_len) { // check the length - if (len != G2_SER_BYTES) { + if (in_len != G2_SER_BYTES) { return BAD_ENCODING; } // check the compression bit - int compressed = bin[0] >> 7; + int compressed = in[0] >> 7; if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { return BAD_ENCODING; } // check if the point in infinity - int is_infinity = bin[0] & 0x40; + int is_infinity = in[0] & 0x40; if (is_infinity) { // the remaining bits need to be cleared - if (bin[0] & 0x3F) { + if (in[0] & 0x3F) { return BAD_ENCODING; } for (int i = 1; i < G2_SER_BYTES - 1; i++) { - if (bin[i]) { + if (in[i]) { return BAD_ENCODING; } } @@ -753,14 +760,14 @@ ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { } // read the sign bit and check for consistency - int y_sign = (bin[0] >> 5) & 1; + int y_sign = (in[0] >> 5) & 1; if (y_sign && (!compressed)) { return BAD_ENCODING; } // use a temporary buffer to mask the header bits and read a.x byte temp[Fp2_BYTES]; - memcpy(temp, bin, Fp2_BYTES); + memcpy(temp, in, Fp2_BYTES); temp[0] &= 0x1F; // clear the header bits ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); if (ret != VALID) { @@ -777,7 +784,7 @@ ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { Fp2 *a_y = &(a->y); if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = Fp2_read_bytes(a_y, bin + Fp2_BYTES, sizeof(a->y)); + ret = Fp2_read_bytes(a_y, in + Fp2_BYTES, sizeof(a->y)); if (ret != VALID) { return ret; } @@ -808,11 +815,11 @@ ERROR E2_read_bytes(E2 *a, const byte *bin, const int len) { // uncompressed form. It assumes buffer is of length G2_SER_BYTES The // serialization follows: // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E2_write_bytes(byte *bin, const E2 *a) { +void E2_write_bytes(byte *out, const E2 *a) { if (E2_is_infty(a)) { // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | (1 << 6); - memset(bin + 1, 0, G2_SER_BYTES - 1); + out[0] = (G2_SERIALIZATION << 7) | (1 << 6); + memset(out + 1, 0, G2_SER_BYTES - 1); return; } E2 tmp; @@ -821,18 +828,18 @@ void E2_write_bytes(byte *bin, const E2 *a) { Fp2 *t_x = &(tmp.x); Fp_from_montg(&real(t_x), &real(t_x)); Fp_from_montg(&imag(t_x), &imag(t_x)); - Fp2_write_bytes(bin, t_x); + Fp2_write_bytes(out, t_x); Fp2 *t_y = &(tmp.y); if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (Fp2_get_sign(t_y) << 5); + out[0] |= (Fp2_get_sign(t_y) << 5); } else { Fp_from_montg(&real(t_y), &real(t_y)); Fp_from_montg(&imag(t_y), &imag(t_y)); - Fp2_write_bytes(bin + Fp2_BYTES, t_y); + Fp2_write_bytes(out + Fp2_BYTES, t_y); } - bin[0] |= (G2_SERIALIZATION << 7); + out[0] |= (G2_SERIALIZATION << 7); } // set p to infinity @@ -940,10 +947,10 @@ void G2_mult_gen(E2 *res, const Fr *expo) { vec_zero(&tmp, sizeof(tmp)); } -// Exponentiation of generator g2 of G2, res = expo.g2 +// Exponentiation of generator g2 of G2, res = expo.g2. // -// This is useful for results being used multiple times in pairings. -// Conversion to affine saves later pre-pairing conversions. +// Result is converted to affine. This is useful for results being used multiple +// times in pairings. Conversion to affine saves later pre-pairing conversions. 
void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { G2_mult_gen(res, expo); E2_to_affine(res, res); @@ -957,9 +964,9 @@ bool E2_in_G2(const E2 *p) { } // computes the sum of the E2 array elements `y[i]` and writes it in `sum` -void E2_sum_vector(E2 *sum, const E2 *y, const int len) { +void E2_sum_vector(E2 *sum, const E2 *y, const int y_len) { E2_set_infty(sum); - for (int i = 0; i < len; i++) { + for (int i = 0; i < y_len; i++) { E2_add(sum, sum, &y[i]); } } @@ -967,41 +974,41 @@ void E2_sum_vector(E2 *sum, const E2 *y, const int len) { // computes the sum of the E2 array elements `y[i]`, converts it // to affine coordinates, and writes it in `sum`. // -// This is useful for results being used multiple times in pairings. -// Conversion to affine saves later pre-pairing conversions. -void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int len) { - E2_sum_vector(sum, y, len); +// Result is converted to affine. This is useful for results being used multiple +// times in pairings. Conversion to affine saves later pre-pairing conversions. +void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int y_len) { + E2_sum_vector(sum, y, y_len); E2_to_affine(sum, sum); } // Subtracts all G2 array elements `y` from an element `x` and writes the -// result in res -void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len) { - E2_sum_vector(res, y, len); +// result in res. +void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int y_len) { + E2_sum_vector(res, y, y_len); E2_neg(res, res); E2_add(res, x, res); } // maps the bytes to a point in G2. -// `len` should be at least Fr_BYTES. +// `in_len` should be at least Fr_BYTES. // this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2(E2 *p, const byte *bytes, int len) { - assert(len >= Fr_BYTES); +void unsafe_map_bytes_to_G2(E2 *p, const byte *in, int in_len) { + assert(in_len >= Fr_BYTES); // map to Fr Fr log; - map_bytes_to_Fr(&log, bytes, len); + map_bytes_to_Fr(&log, in, in_len); // multiplies G2 generator by a random scalar G2_mult_gen(p, &log); } -// maps `bytes` to a point in E2\G2 and stores it in p. +// maps `in` to a point in E2\G2 and stores it in p. // `len` should be at least 192. // this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2complement(E2 *p, const byte *bytes, int len) { - assert(len >= 192); +void unsafe_map_bytes_to_G2complement(E2 *p, const byte *in, int in_len) { + assert(in_len >= 192); Fp2 u; - map_96_bytes_to_Fp(&real(&u), bytes, 96); - map_96_bytes_to_Fp(&imag(&u), bytes + 96, 96); + map_96_bytes_to_Fp(&real(&u), in, 96); + map_96_bytes_to_Fp(&imag(&u), in + 96, 96); // map to E2's isogenous and then to E2 map_to_isogenous_E2((POINTonE2 *)p, u); isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); @@ -1080,6 +1087,8 @@ void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { final_exp(res_vec, res_vec); } +// ------------------- Other utilities + // This is a testing function and is not used in exported functions // It uses an expand message XMD based on SHA2-256. 
void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, diff --git a/crypto/blst_include.h b/crypto/blst_include.h index dc942b5976b..5a3c47f0260 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -6,16 +6,16 @@ #include "fields.h" #include "point.h" -// types used by the Flow crypto library that are imported from BLST -// these type definitions are used as an abstraction from BLST internal types +// types used by the Flow crypto library that are imported from BLST. +// these type definitions are used as an abstraction from BLST internal types. // field elements F_r // where `r` is the order of G1/G2. // F_r elements are represented as big numbers reduced modulo `r`. Big numbers // are represented as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r -// elements). `Fr` is defined as a struct to be exportable through cgo to the Go -// layer. +// elements). `Fr` is defined as a struct so that it can be exportable through +// cgo to the Go layer. #define R_BITS 255 // equal to Fr_bits in bls12381_utils.h typedef struct { limb_t limbs[(R_BITS + 63) / 64]; @@ -30,7 +30,7 @@ typedef vec384 Fp; // curve E_1 (over F_p) // E_1 points are represented in Jacobian coordinates (x,y,z), -// where x, y, x are elements of F_p (type `Fp`). +// where x, y, z are elements of F_p (type `Fp`). // `E1` is equivalent to type `POINTonE1` (used internally by BLST for Jacobian // E1 elements) `E1` is defined as a struct to be exportable through cgo to the // Go layer. `E1` is also used to represent all subgroup G_1 elements. @@ -49,7 +49,7 @@ typedef vec384x Fp2; // curve E_2 (over F_p^2) // E_2 points are represented in Jacobian coordinates (x,y,z), -// where x, y, x are elements of F_p (type `Fp`). +// where x, y, z are elements of F_p (type `Fp`). // `E2` is equivelent to type `POINTonE2` (used internally by BLST for Jacobian // E2 elements) `E2` is defined as a struct to be exportable through cgo to the // Go layer. `E2` is also used to represent all subgroup G_2 elements. From 2332a61323bd698419834a5c1c8075b34d2f7ddb Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 20:46:39 -0600 Subject: [PATCH 182/200] clean up threshold and dkg C files - use poly degree in secret sharing --- crypto/bls12381_utils.c | 1 - crypto/bls_core.c | 26 +++++++++++----------- crypto/bls_thresholdsign.go | 4 ++-- crypto/bls_thresholdsign_core.c | 38 ++++++++++++++++----------------- crypto/dkg_core.c | 26 ++++++++++++---------- 5 files changed, 49 insertions(+), 46 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 9f168e0b3e0..4b2d4ba0cc4 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -457,7 +457,6 @@ bool E1_affine_on_curve(const E1 *p) { // It assumes input `p` is on E1. bool E1_in_G1(const E1 *p) { // currently uses Scott method - // TODO: compare to clearing the cofactor using u-1 return POINTonE1_in_G1((const POINTonE1 *)p); } diff --git a/crypto/bls_core.c b/crypto/bls_core.c index aac7d60ee18..83c12480829 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -2,12 +2,10 @@ // this file is about the core functions required by the BLS signature scheme -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) - -// Computes a BLS signature from a G1 point and writes it in `out`. +// Compute a BLS signature from a G1 point (not checked) and writes it in `out`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. 
static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { - // s = h^s + // s = h^sk E1 s; E1_mult(&s, h, sk); E1_write_bytes(out, &s); @@ -15,8 +13,8 @@ static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { // Computes a BLS signature from a hash and writes it in `out`. // `hash` represents the hashed message with length `hash_len` equal to -// `MAP_TO_G1_INPUT_LEN`. `out` must be allocated properly with `G1_SER_BYTES` -// bytes. +// `MAP_TO_G1_INPUT_LEN`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { // hash to G1 E1 h; @@ -33,7 +31,8 @@ extern const E2 *BLS12_381_minus_g2; // Verifies a BLS signature (G1 point) against a public key (G2 point) // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 -// respectively. This function only checks the pairing equality. +// respectively. +// This function only checks the pairing equality. static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { E1 elemsG1[2]; E2 elemsG2[2]; @@ -70,8 +69,9 @@ static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { // membership check of pks in G2 is not verified in this function // the membership check is separated to allow optimizing multiple verifications // using the same pks -int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, - const byte *hashes, const uint32_t *len_hashes, +int bls_verifyPerDistinctMessage(const byte *sig, + const int nb_hashes, const byte *hashes, + const uint32_t *len_hashes, const uint32_t *pks_per_hash, const E2 *pks) { int ret = UNDEFINED; // return value @@ -148,8 +148,8 @@ int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, // the membership check is separated to allow optimizing multiple verifications // using the same pks int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, - const uint32_t *hashes_per_pk, const byte *hashes, - const uint32_t *len_hashes) { + const uint32_t *hashes_per_pk, + const byte *hashes, const uint32_t *len_hashes) { int ret = UNDEFINED; // return value @@ -464,8 +464,8 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple // verifications using the same public keys. 
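As a reminder of the relations checked in this file: a signature is σ = sk·H(m) (bls_sign_E1 above), and plain verification checks the pairing equality e(σ, g2) = e(H(m), pk) with pk = sk·g2. The SPoCK check below relies on a similar identity: for two honestly formed signatures σ1, σ2 of the same (possibly secret) message under keys pk1, pk2, bilinearity gives e(σ1, pk2) = e(H(m), g2)^(sk1·sk2) = e(σ2, pk1), which is the equality bls_spock_verify is built around.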
-int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2, - const byte *sig2) { +int bls_spock_verify(const E2 *pk1, const byte *sig1, + const E2 *pk2, const byte *sig2) { E1 elemsG1[2]; E2 elemsG2[2]; diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 83fb6d6949f..c6ad1facd97 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -416,7 +416,7 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat result := C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&shares[0]), - (*C.uint8_t)(&signers[0]), (C.int)(s.threshold+1)) + (*C.uint8_t)(&signers[0]), (C.int)(s.threshold)) if result != valid { return nil, invalidSignatureError @@ -508,7 +508,7 @@ func BLSReconstructThresholdSignature(size int, threshold int, if C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&flatShares[0]), - (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold+1), + (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold), ) != valid { return nil, invalidSignatureError } diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index dc7e1354907..7bbe526121a 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -6,9 +6,10 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range // [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the -// polynomial P. `len` is equal to `t+1` where `t` is the polynomial degree. +// polynomial P. +// `degree` is equal to the polynomial degree `t`. static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, - const byte indices[], const int len) { + const byte indices[], const int degree) { // coefficient is computed as N * D^(-1) Fr numerator; // eventually would represent N*R^k @@ -24,15 +25,14 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately // 64/MAX_IND_BITS) this means we can multiply up to (k) indices in a limb (64 // bits) without overflowing. -#define MAX_IND_LOOPS (64 / MAX_IND_BITS) - const int loops = MAX_IND_LOOPS; + const int loops = 64 / MAX_IND_BITS; int k, j = 0; Fr tmp; - while (j < len) { + while (j < degree+1) { limb_t limb_numerator = 1; limb_t limb_denominator = 1; - for (k = j; j < MIN(len, k + loops); - j++) { // batch up to `loops` elements in one limb + // batch up to `loops` elements in one limb + for (k = j; j < MIN(degree+1, k + loops); j++) { if (j == i) continue; if (indices[j] < indices[i]) { @@ -65,11 +65,11 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the // indices [indices(0)..indices(t)] and their G1 images [shares(0)..shares(t)], -// and stores the resulting G1 point in `dest`. `len` is equal to `t+1` where -// `t` is the polynomial degree. +// and stores the resulting G1 point in `dest`. +// `degree` is equal to the polynomial degree `t`. static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], const byte indices[], - const int len) { + const int degree) { // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... 
+ A_t*x^t in G1 // where A_i = g1 ^ a_i @@ -79,22 +79,22 @@ static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], E1_set_infty(out); Fr fr_lagr_coef; E1 mult; - for (int i = 0; i < len; i++) { - Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, len); + for (int i = 0; i < degree+1; i++) { + Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, degree); E1_mult(&mult, &shares[i], &fr_lagr_coef); E1_add(out, out, &mult); } } -// Computes the Langrange interpolation at zero LI(0) with regards to the +// Computes the Lagrange interpolation at zero LI(0) with regards to the // indices [indices(0)..indices(t)] and writes their E1 concatenated -// serializations [shares(1)..shares(t+1)] in `dest`. `len` is equal to `t+1` -// where `t` is the polynomial degree. +// serializations [shares(1)..shares(t+1)] in `dest`. +// `degree` is equal to the polynomial degree `t`. int E1_lagrange_interpolate_at_zero_write(byte *dest, const byte *shares, - const byte indices[], const int len) { + const byte indices[], const int degree) { int read_ret; - E1 *E1_shares = malloc(sizeof(E1) * len); - for (int i = 0; i < len; i++) { + E1 *E1_shares = malloc(sizeof(E1) * (degree+1)); + for (int i = 0; i < degree+1; i++) { read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); if (read_ret != VALID) { @@ -106,7 +106,7 @@ int E1_lagrange_interpolate_at_zero_write(byte *dest, const byte *shares, // computes Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1, // where A_i = g1 ^ a_i E1 res; - E1_lagrange_interpolate_at_zero(&res, E1_shares, indices, len); + E1_lagrange_interpolate_at_zero(&res, E1_shares, indices, degree); // export the result E1_write_bytes(dest, &res); read_ret = VALID; diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index f5f48db67ae..3dab93b9fc7 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -13,8 +13,8 @@ void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int degree, // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `degree` is P's degree, x is a small integer less than -// 255. The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non -// NULL +// `MAX_IND` (currently 255). +// The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL. void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, const byte x) { Fr_set_zero(image); @@ -34,7 +34,9 @@ void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 -// and stores the point in y +// and stores the point in y. +// - A_i being G2 points +// - x being a small scalar (less than `MAX_IND`) static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, const byte x) { E2_set_infty(y); @@ -45,7 +47,9 @@ static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, } // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) -// where Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2[X] +// where Q(x) = A_0 + A_1*x + ... + A_n*x^n +// - A_i being G2 points +// - x being a small scalar (less than `MAX_IND`) void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int degree) { for (byte i = 0; i < len_y; i++) { @@ -56,17 +60,17 @@ void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, // export an array of G2 into an array of bytes by concatenating // all serializations of G2 points in order. -// the array must be of length (len * G2_SER_BYTES). 
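Note: threshold reconstruction uses Lagrange interpolation at zero, L_i(0) = prod_{j != i} x_j / (x_j - x_i) over Fr, so t+1 valid shares of a degree-t polynomial recover P(0); this is why the functions above now take the degree t instead of t+1. A toy big-integer sketch of the same interpolation (illustrative shares, not the limb-based C arithmetic):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// BLS12-381 subgroup order r
	r, _ := new(big.Int).SetString(
		"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

	// toy degree-1 polynomial P(x) = 9 + 2x, so P(0) = 9 and t = 1
	xs := []*big.Int{big.NewInt(3), big.NewInt(5)}   // t+1 = 2 distinct indices
	ys := []*big.Int{big.NewInt(15), big.NewInt(19)} // shares P(3), P(5)

	p0 := big.NewInt(0)
	for i := range xs {
		// L_i(0) = prod_{j != i} x_j / (x_j - x_i)  (mod r)
		num, den := big.NewInt(1), big.NewInt(1)
		for j := range xs {
			if j == i {
				continue
			}
			num.Mul(num, xs[j])
			den.Mul(den, new(big.Int).Sub(xs[j], xs[i]))
		}
		coeff := new(big.Int).Mul(num, new(big.Int).ModInverse(den.Mod(den, r), r))
		p0.Add(p0, new(big.Int).Mul(coeff, ys[i]))
		p0.Mod(p0, r)
	}
	fmt.Println("reconstructed P(0) =", p0) // 9
}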
-void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { +// the array must be of length (A_len * G2_SER_BYTES). +void G2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { byte *p = out; - for (int i = 0; i < len; i++) { + for (int i = 0; i < A_len; i++) { E2_write_bytes(p, &A[i]); p += G2_SER_BYTES; } } -// The function imports an array of `n` E2 points from a concatenated array of -// bytes. The bytes array is supposed to be of size (n * G2_SER_BYTES). +// The function imports an array of `A_len` E2 points from a concatenated array of +// bytes. The bytes array is supposed to be of size (A_len * G2_SER_BYTES). // // If return is `VALID`, output vector is guaranteed to be in G2. // It returns other errors if at least one input isn't a serialization of a E2 @@ -80,9 +84,9 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int len) { // E2. // - POINT_NOT_IN_GROUP if at least one E2 point isn't in G2. // - VALID if deserialization of all points to G2 is valid. -ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int n) { +ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int A_len) { byte *p = (byte *)src; - for (int i = 0; i < n; i++) { + for (int i = 0; i < A_len; i++) { int read_ret = E2_read_bytes(&A[i], p, G2_SER_BYTES); if (read_ret != VALID) { return read_ret; From 2cd3d283705e2628b89a4d6920ad5f010690c260 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 21:03:03 -0600 Subject: [PATCH 183/200] format c files --- crypto/bls12381_utils.c | 18 +++++++++--------- crypto/bls12381_utils.h | 5 ++--- crypto/bls_core.c | 17 ++++++++--------- crypto/bls_thresholdsign_core.c | 25 +++++++++++++------------ crypto/blst_include.h | 2 +- crypto/dkg_core.c | 14 +++++++------- crypto/dkg_include.h | 2 +- 7 files changed, 41 insertions(+), 42 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 4b2d4ba0cc4..528f865cfdd 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1,6 +1,6 @@ // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold -// signature and the BLS distributed key generation protocols +// signature, BLS-SPoCK and the BLS distributed key generation protocols #include "bls12381_utils.h" #include "assert.h" @@ -18,7 +18,7 @@ void types_sanity(void) { assert(sizeof(E2) == sizeof(POINTonE2)); assert(sizeof(vec384fp12) == sizeof(Fp12)); } - + // ------------------- Fr utilities // Montgomery constant R related to the curve order r @@ -78,8 +78,8 @@ void Fr_squ_montg(Fr *res, const Fr *a) { // res = a*R void Fr_to_montg(Fr *res, const Fr *a) { - mul_mont_sparse_256((limb_t *)res, (limb_t *)a, - BLS12_381_rRR, BLS12_381_r, r0); + mul_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_rRR, BLS12_381_r, + r0); } // res = a*R^(-1) @@ -273,8 +273,8 @@ static bool Fp_check(const Fp *a) { Fp temp; Fp_add(&temp, a, &ZERO_384); return vec_is_equal(&temp, a, Fp_BYTES); - // no need to clear `tmp` as no current use-case involves sensitive data being passed - // as `a` + // no need to clear `tmp` as no current use-case involves sensitive data being + // passed as `a` } // res = a*b*R^(-1) @@ -606,7 +606,7 @@ void E1_sum_vector(E1 *sum, const E1 *y, const int len) { // Computes the sum of input E1 elements flattened in a single byte // array `in_bytes` of `in_len` bytes. and writes the sum (E1 element) as -// bytes in `out`. +// bytes in `out`. 
// The function does not check membership of E1 inputs in G1 // subgroup. The header is using byte pointers to minimize Cgo calls from the Go // layer. @@ -948,7 +948,7 @@ void G2_mult_gen(E2 *res, const Fr *expo) { // Exponentiation of generator g2 of G2, res = expo.g2. // -// Result is converted to affine. This is useful for results being used multiple +// Result is converted to affine. This is useful for results being used multiple // times in pairings. Conversion to affine saves later pre-pairing conversions. void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { G2_mult_gen(res, expo); @@ -973,7 +973,7 @@ void E2_sum_vector(E2 *sum, const E2 *y, const int y_len) { // computes the sum of the E2 array elements `y[i]`, converts it // to affine coordinates, and writes it in `sum`. // -// Result is converted to affine. This is useful for results being used multiple +// Result is converted to affine. This is useful for results being used multiple // times in pairings. Conversion to affine saves later pre-pairing conversions. void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int y_len) { E2_sum_vector(sum, y, y_len); diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index b0f96669ed7..923208ef3f3 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -1,6 +1,6 @@ // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold -// signature and the BLS distributed key generation protocols +// signature, BLS-SPoCK and the BLS distributed key generation protocols #ifndef _BLS12_381_UTILS_H #define _BLS12_381_UTILS_H @@ -101,8 +101,7 @@ void unsafe_map_bytes_to_G1(E1 *, const byte *, int); void unsafe_map_bytes_to_G1complement(E1 *, const byte *, int); #define MAP_TO_G1_INPUT_LEN (2 * (Fp_BYTES + SEC_BITS / 8)) -int map_to_G1(E1 *, const byte *, - const int); // functions in bls12381_hashtocurve.c +int map_to_G1(E1 *, const byte *, const int); // E2 and G2 utilities void E2_set_infty(E2 *p); diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 83c12480829..19d29f46713 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -13,7 +13,7 @@ static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { // Computes a BLS signature from a hash and writes it in `out`. // `hash` represents the hashed message with length `hash_len` equal to -// `MAP_TO_G1_INPUT_LEN`. +// `MAP_TO_G1_INPUT_LEN`. // `out` must be allocated properly with `G1_SER_BYTES` bytes. int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { // hash to G1 @@ -31,7 +31,7 @@ extern const E2 *BLS12_381_minus_g2; // Verifies a BLS signature (G1 point) against a public key (G2 point) // and a message hash `h` (G1 point). // Hash, signature and public key are assumed to be in G1, G1 and G2 -// respectively. +// respectively. // This function only checks the pairing equality. 
static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { E1 elemsG1[2]; @@ -69,9 +69,8 @@ static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { // membership check of pks in G2 is not verified in this function // the membership check is separated to allow optimizing multiple verifications // using the same pks -int bls_verifyPerDistinctMessage(const byte *sig, - const int nb_hashes, const byte *hashes, - const uint32_t *len_hashes, +int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, + const byte *hashes, const uint32_t *len_hashes, const uint32_t *pks_per_hash, const E2 *pks) { int ret = UNDEFINED; // return value @@ -148,8 +147,8 @@ int bls_verifyPerDistinctMessage(const byte *sig, // the membership check is separated to allow optimizing multiple verifications // using the same pks int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, - const uint32_t *hashes_per_pk, - const byte *hashes, const uint32_t *len_hashes) { + const uint32_t *hashes_per_pk, const byte *hashes, + const uint32_t *len_hashes) { int ret = UNDEFINED; // return value @@ -464,8 +463,8 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, // Membership check in G2 of both keys is not verified in this function. // the membership check in G2 is separated to allow optimizing multiple // verifications using the same public keys. -int bls_spock_verify(const E2 *pk1, const byte *sig1, - const E2 *pk2, const byte *sig2) { +int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2, + const byte *sig2) { E1 elemsG1[2]; E2 elemsG2[2]; diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index 7bbe526121a..7c1d809d228 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -6,7 +6,7 @@ // Computes the Lagrange coefficient L_i(0) in Fr with regards to the range // [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the -// polynomial P. +// polynomial P. // `degree` is equal to the polynomial degree `t`. static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, const byte indices[], const int degree) { @@ -22,17 +22,17 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // sign of D: 0 for positive and 1 for negative int sign = 0; -// the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately -// 64/MAX_IND_BITS) this means we can multiply up to (k) indices in a limb (64 -// bits) without overflowing. + // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately + // 64/MAX_IND_BITS) this means we can multiply up to (k) indices in a limb (64 + // bits) without overflowing. const int loops = 64 / MAX_IND_BITS; int k, j = 0; Fr tmp; - while (j < degree+1) { + while (j < degree + 1) { limb_t limb_numerator = 1; limb_t limb_denominator = 1; // batch up to `loops` elements in one limb - for (k = j; j < MIN(degree+1, k + loops); j++) { + for (k = j; j < MIN(degree + 1, k + loops); j++) { if (j == i) continue; if (indices[j] < indices[i]) { @@ -65,7 +65,7 @@ static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, // Computes the Langrange interpolation at zero P(0) = LI(0) with regards to the // indices [indices(0)..indices(t)] and their G1 images [shares(0)..shares(t)], -// and stores the resulting G1 point in `dest`. +// and stores the resulting G1 point in `dest`. // `degree` is equal to the polynomial degree `t`. 
static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], const byte indices[], @@ -79,7 +79,7 @@ static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], E1_set_infty(out); Fr fr_lagr_coef; E1 mult; - for (int i = 0; i < degree+1; i++) { + for (int i = 0; i < degree + 1; i++) { Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, degree); E1_mult(&mult, &shares[i], &fr_lagr_coef); E1_add(out, out, &mult); @@ -88,13 +88,14 @@ static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[], // Computes the Lagrange interpolation at zero LI(0) with regards to the // indices [indices(0)..indices(t)] and writes their E1 concatenated -// serializations [shares(1)..shares(t+1)] in `dest`. +// serializations [shares(1)..shares(t+1)] in `dest`. // `degree` is equal to the polynomial degree `t`. int E1_lagrange_interpolate_at_zero_write(byte *dest, const byte *shares, - const byte indices[], const int degree) { + const byte indices[], + const int degree) { int read_ret; - E1 *E1_shares = malloc(sizeof(E1) * (degree+1)); - for (int i = 0; i < degree+1; i++) { + E1 *E1_shares = malloc(sizeof(E1) * (degree + 1)); + for (int i = 0; i < degree + 1; i++) { read_ret = E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); if (read_ret != VALID) { diff --git a/crypto/blst_include.h b/crypto/blst_include.h index 5a3c47f0260..d5eb5079cfd 100644 --- a/crypto/blst_include.h +++ b/crypto/blst_include.h @@ -14,7 +14,7 @@ // F_r elements are represented as big numbers reduced modulo `r`. Big numbers // are represented as a little endian vector of limbs. // `Fr` is equivalent to type `vec256` (used internally by BLST for F_r -// elements). `Fr` is defined as a struct so that it can be exportable through +// elements). `Fr` is defined as a struct so that it can be exportable through // cgo to the Go layer. #define R_BITS 255 // equal to Fr_bits in bls12381_utils.h typedef struct { diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 3dab93b9fc7..c8fee6917f6 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -13,7 +13,7 @@ void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int degree, // computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. // a_i are all in Fr, `degree` is P's degree, x is a small integer less than -// `MAX_IND` (currently 255). +// `MAX_IND` (currently 255). // The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL. void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, const byte x) { @@ -47,7 +47,7 @@ static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, } // computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) -// where Q(x) = A_0 + A_1*x + ... + A_n*x^n +// where Q(x) = A_0 + A_1*x + ... + A_n*x^n // - A_i being G2 points // - x being a small scalar (less than `MAX_IND`) void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, @@ -58,10 +58,10 @@ void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, } } -// export an array of G2 into an array of bytes by concatenating -// all serializations of G2 points in order. +// export an array of E2 into an array of bytes by concatenating +// all serializations of E2 points in order. // the array must be of length (A_len * G2_SER_BYTES). 
-void G2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { +void E2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { byte *p = out; for (int i = 0; i < A_len; i++) { E2_write_bytes(p, &A[i]); @@ -69,8 +69,8 @@ void G2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { } } -// The function imports an array of `A_len` E2 points from a concatenated array of -// bytes. The bytes array is supposed to be of size (A_len * G2_SER_BYTES). +// The function imports an array of `A_len` E2 points from a concatenated array +// of bytes. The bytes array is supposed to be of size (A_len * G2_SER_BYTES). // // If return is `VALID`, output vector is guaranteed to be in G2. // It returns other errors if at least one input isn't a serialization of a E2 diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 05d46187749..02fb9a363f4 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -8,7 +8,7 @@ void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int deg, void Fr_polynomial_image(Fr *out, E2 *y, const Fr *a, const int deg, const byte x); void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int deg); -void G2_vector_write_bytes(byte *out, const E2 *A, const int len); +void E2_vector_write_bytes(byte *out, const E2 *A, const int len); ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int len); bool G2_check_log(const Fr *x, const E2 *y); From 262c3e0f332153b6e15f113f7cf9ff05fcc5a50d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Wed, 20 Sep 2023 21:03:22 -0600 Subject: [PATCH 184/200] rename G2_ prefix to E2_ --- crypto/bls.go | 7 ++++--- crypto/bls12381_utils.go | 8 ++++---- crypto/dkg_feldmanvss.go | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c6f01a6ab28..c33a90fdce6 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -37,11 +37,12 @@ import ( ) const ( - // SignatureLenBLSBLS12381 is the size of a `G_1` element. + // SignatureLenBLSBLS12381 is the serialization size of a `G_1` element. SignatureLenBLSBLS12381 = g1BytesLen - // PubKeyLenBLSBLS12381 is the size of a `G_2` element. + // PubKeyLenBLSBLS12381 is the serialization size of a `G_2` element. PubKeyLenBLSBLS12381 = g2BytesLen - // PrKeyLenBLSBLS12381 is the size of a `F_r` element, where `r` is the order of `G_1` and `G_2`. + // PrKeyLenBLSBLS12381 is the serialization size of a `F_r` element, + // where `r` is the order of `G_1` and `G_2`. 
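For BLS12-381 with the compressed Zcash-style encodings used throughout this patch set, the three serialization sizes named above are expected to be 48, 96 and 32 bytes. The concrete numbers below are the usual values and are an assumption here, since the diff only defines the constants in terms of g1BytesLen, g2BytesLen and frBytesLen:

package main

import "fmt"

// Assumed concrete values of the compressed BLS12-381 encodings.
const (
	g1BytesLen = 48 // one Fp coordinate, ceil(381/8) bytes (signature, G_1)
	g2BytesLen = 96 // two Fp coordinates (public key, G_2)
	frBytesLen = 32 // 255-bit scalar padded to 32 bytes (private key, F_r)
)

func main() {
	fmt.Println("sig:", g1BytesLen, "pk:", g2BytesLen, "sk:", frBytesLen)
}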
PrKeyLenBLSBLS12381 = frBytesLen // Hash to curve params diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index a3867b31b20..adfde987cfe 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -61,8 +61,8 @@ const ( ) // header of the point at infinity serializations -var g1SerHeader byte // g1 -var g2SerHeader byte // g2 +var g1SerHeader byte // g1 (G1 identity) +var g2SerHeader byte // g2 (G2 identity) // `g1“ serialization var g1Serialization []byte @@ -214,7 +214,7 @@ func readScalarFrStar(a *scalar, src []byte) error { return invalidInputsErrorf("input length must be %d, got %d", frBytesLen, len(src)) case badValue: - return invalidInputsErrorf("scalar is not in the correct range w.r.t the BLS12-381 curve") + return invalidInputsErrorf("scalar is not in the correct range") default: return invalidInputsErrorf("reading the scalar failed") } @@ -233,7 +233,7 @@ func readPointE2(a *pointE2, src []byte) error { case valid: return nil case badEncoding, badValue: - return invalidInputsErrorf("input could not deserialize to a E2 point") + return invalidInputsErrorf("input could not deserialize to an E2 point") case pointNotOnCurve: return invalidInputsErrorf("input is not a point on curve E2") default: diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 2814e59ee14..3ce7f609f95 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -450,7 +450,7 @@ func frPolynomialImage(dest []byte, a []scalar, x index, y *pointE2) { // writeVerifVector exports a vector A into an array of bytes // assuming the array length matches the vector length func writeVerifVector(dest []byte, A []pointE2) { - C.G2_vector_write_bytes((*C.uchar)(&dest[0]), + C.E2_vector_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(&A[0]), (C.int)(len(A)), ) From dc28e03c607b3afb23c8c8ff56c249211eb74c08 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 14:15:02 -0500 Subject: [PATCH 185/200] move godoc closer to the type definition --- crypto/bls.go | 11 ++++------- crypto/bls12381_utils.go | 4 +++- crypto/bls_thresholdsign.go | 2 +- crypto/dkg_feldmanvss.go | 2 +- crypto/ecdsa.go | 10 ++++------ 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/crypto/bls.go b/crypto/bls.go index c33a90fdce6..27ddd881bfd 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -301,7 +301,6 @@ func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, sk := newPrKeyBLSBLS12381(nil) err := readScalarFrStar(&sk.scalar, privateKeyBytes) - if err != nil { return nil, fmt.Errorf("failed to read the private key: %w", err) } @@ -347,9 +346,6 @@ func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (Publ } // prKeyBLSBLS12381 is the private key of BLS using BLS12_381, it implements PrivateKey - -var _ PrivateKey = (*prKeyBLSBLS12381)(nil) - type prKeyBLSBLS12381 struct { // public key pk *pubKeyBLSBLS12381 @@ -357,6 +353,8 @@ type prKeyBLSBLS12381 struct { scalar scalar } +var _ PrivateKey = (*prKeyBLSBLS12381)(nil) + // newPrKeyBLSBLS12381 creates a new BLS private key with the given scalar. // If no scalar is provided, the function allocates an // empty scalar. @@ -427,9 +425,6 @@ func (sk *prKeyBLSBLS12381) String() string { // pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, // it implements PublicKey. - -var _ PublicKey = (*pubKeyBLSBLS12381)(nil) - type pubKeyBLSBLS12381 struct { // The package guarantees an instance is only created with a point // on the correct G2 subgroup. 
No membership check is needed when the @@ -446,6 +441,8 @@ type pubKeyBLSBLS12381 struct { isIdentity bool } +var _ PublicKey = (*pubKeyBLSBLS12381)(nil) + // newPubKeyBLSBLS12381 creates a new BLS public key with the given point. // If no scalar is provided, the function allocates an // empty scalar. diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index adfde987cfe..41937bc18c2 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -64,7 +64,7 @@ const ( var g1SerHeader byte // g1 (G1 identity) var g2SerHeader byte // g2 (G2 identity) -// `g1“ serialization +// `g1` serialization var g1Serialization []byte var g2PublicKey pubKeyBLSBLS12381 @@ -89,12 +89,14 @@ func initBLS12381() { g2PublicKey.isIdentity = true } +// String returns a hex-encoded representation of the scalar. func (a *scalar) String() string { encoding := make([]byte, frBytesLen) writeScalar(encoding, a) return fmt.Sprintf("%#x", encoding) } +// String returns a hex-encoded representation of the E2 point. func (p *pointE2) String() string { encoding := make([]byte, g2BytesLen) writePointE2(encoding, p) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index c6ad1facd97..efe660570db 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -536,7 +536,7 @@ func EnoughShares(threshold int, sharesNumber int) (bool, error) { // threshold signature scheme with a trusted dealer. // // The function returns : -// - (nil, nil, nil, invalidInputsErrorf) if: +// - (nil, nil, nil, invalidInputsErrorf) if: // - seed is too short // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go index 3ce7f609f95..dbe7771b6c4 100644 --- a/crypto/dkg_feldmanvss.go +++ b/crypto/dkg_feldmanvss.go @@ -395,7 +395,7 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { } } -// receives the public vector from the +// receives the public vector from the dealer func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { // only accept the verification vector from the dealer. 
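The `var _ PrivateKey = (*prKeyBLSBLS12381)(nil)` lines that this patch moves next to the type definitions are compile-time assertions, not runtime values. A small self-contained illustration of the idiom (all names hypothetical):

package main

// Signer is a hypothetical stand-in for interfaces such as PrivateKey or
// ThresholdSignatureParticipant.
type Signer interface {
	Sign(msg []byte) []byte
}

type dummySigner struct{}

func (d *dummySigner) Sign(msg []byte) []byte { return append([]byte(nil), msg...) }

// The build breaks on this line if *dummySigner ever stops satisfying Signer;
// keeping it next to the type definition makes the contract visible where the
// methods live, which is what the patch does for the BLS and ECDSA key types.
var _ Signer = (*dummySigner)(nil)

func main() {}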
if origin != s.dealerIndex { diff --git a/crypto/ecdsa.go b/crypto/ecdsa.go index 67d97e9a854..b09d3d5922f 100644 --- a/crypto/ecdsa.go +++ b/crypto/ecdsa.go @@ -322,9 +322,6 @@ func (a *ecdsaAlgo) decodePublicKeyCompressed(pkBytes []byte) (PublicKey, error) } // prKeyECDSA is the private key of ECDSA, it implements the interface PrivateKey - -var _ PrivateKey = (*prKeyECDSA)(nil) - type prKeyECDSA struct { // the signature algo alg *ecdsaAlgo @@ -334,6 +331,8 @@ type prKeyECDSA struct { pubKey *pubKeyECDSA } +var _ PrivateKey = (*prKeyECDSA)(nil) + // Algorithm returns the algo related to the private key func (sk *prKeyECDSA) Algorithm() SigningAlgorithm { return sk.alg.algo @@ -395,9 +394,6 @@ func (sk *prKeyECDSA) String() string { } // pubKeyECDSA is the public key of ECDSA, it implements PublicKey - -var _ PublicKey = (*pubKeyECDSA)(nil) - type pubKeyECDSA struct { // the signature algo alg *ecdsaAlgo @@ -405,6 +401,8 @@ type pubKeyECDSA struct { goPubKey *ecdsa.PublicKey } +var _ PublicKey = (*pubKeyECDSA)(nil) + // Algorithm returns the the algo related to the private key func (pk *pubKeyECDSA) Algorithm() SigningAlgorithm { return pk.alg.algo From 627d682bfb2b9bd3b2064f7d450958aa8f86368d Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 19:36:05 -0500 Subject: [PATCH 186/200] add E1 random point multiplication benchmark --- crypto/bls12381_utils.go | 6 +++--- crypto/bls12381_utils_test.go | 18 +++++++++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 41937bc18c2..65a54bb9dd4 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -103,8 +103,8 @@ func (p *pointE2) String() string { return fmt.Sprintf("%#x", encoding) } -// Scalar multiplication of a generic point `p` in G1 -func (p *pointE1) scalarMultG1(res *pointE1, expo *scalar) { +// Scalar multiplication of a generic point `p` in E1 +func (p *pointE1) scalarMultE1(res *pointE1, expo *scalar) { C.E1_mult((*C.E1)(res), (*C.E1)(p), (*C.Fr)(expo)) } @@ -165,7 +165,7 @@ func randFr(x *scalar, rand random.Rand) bool { // and saves the random in `x`. func randFrStar(x *scalar, rand random.Rand) { isZero := true - // exteremely unlikely this loop runs more than once, + // extremely unlikely this loop runs more than once, // but force the output to be non-zero instead of propagating an error. 
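The retry loop mentioned above is plain rejection sampling: draw a uniform scalar and redraw in the negligible case where it is zero. A self-contained Go sketch of the same idea using crypto/rand and math/big (not the package's randFr/randFrStar, which draw from a caller-supplied PRG):

package main

import (
	"crypto/rand"
	"fmt"
	"math/big"
)

// randNonZeroMod returns a uniformly random scalar in [1, r-1] by rejection
// sampling; with a 255-bit r the loop repeats with probability about 2^-255.
func randNonZeroMod(r *big.Int) (*big.Int, error) {
	for {
		x, err := rand.Int(rand.Reader, r) // uniform in [0, r-1]
		if err != nil {
			return nil, err
		}
		if x.Sign() != 0 {
			return x, nil
		}
	}
}

func main() {
	// BLS12-381 group order r (well-known constant).
	r, _ := new(big.Int).SetString("73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)
	x, err := randNonZeroMod(r)
	fmt.Println(x.BitLen() <= 255, err)
}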
for isZero { isZero = randFr(x, rand) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index ade31bbb6b9..e71702d7cbf 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -63,8 +63,8 @@ func BenchmarkScalarMult(b *testing.B) { // G1 generator multiplication // Note that generator and random point multiplications // are implemented with the same algorithm - b.Run("G1", func(b *testing.B) { - var res pointE1 + var res pointE1 + b.Run("G1 gen", func(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG1(&res, &expo) @@ -72,10 +72,22 @@ func BenchmarkScalarMult(b *testing.B) { b.StopTimer() }) + // E1 random point multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + b.Run("E1 rand", func(b *testing.B) { + var res pointE1 + b.ResetTimer() + for i := 0; i < b.N; i++ { + res.scalarMultE1(&res, &expo) + } + b.StopTimer() + }) + // G2 generator multiplication // Note that generator and random point multiplications // are implemented with the same algorithm - b.Run("G2", func(b *testing.B) { + b.Run("G2 gen", func(b *testing.B) { var res pointE2 b.ResetTimer() for i := 0; i < b.N; i++ { From cf8667b93c5d1978a196a3464a42530324573570 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 19:40:00 -0500 Subject: [PATCH 187/200] remove StopTimer in bench --- crypto/bls12381_utils_test.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index e71702d7cbf..257ec1afa1b 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -69,7 +69,6 @@ func BenchmarkScalarMult(b *testing.B) { for i := 0; i < b.N; i++ { generatorScalarMultG1(&res, &expo) } - b.StopTimer() }) // E1 random point multiplication @@ -81,7 +80,6 @@ func BenchmarkScalarMult(b *testing.B) { for i := 0; i < b.N; i++ { res.scalarMultE1(&res, &expo) } - b.StopTimer() }) // G2 generator multiplication @@ -93,7 +91,6 @@ func BenchmarkScalarMult(b *testing.B) { for i := 0; i < b.N; i++ { generatorScalarMultG2(&res, &expo) } - b.StopTimer() }) } @@ -145,7 +142,6 @@ func BenchmarkMapToG1(b *testing.B) { p = mapToG1(input) } require.NotNil(b, p) - b.StopTimer() } // test subgroup membership check in G1 and G2 @@ -187,7 +183,6 @@ func BenchmarkSubgroupCheck(b *testing.B) { for i := 0; i < b.N; i++ { _ = checkMembershipG1(&p) // G1 } - b.StopTimer() }) b.Run("G2", func(b *testing.B) { @@ -197,7 +192,6 @@ func BenchmarkSubgroupCheck(b *testing.B) { for i := 0; i < b.N; i++ { _ = checkMembershipG2(&p) // G2 } - b.StopTimer() }) } From 5f89c65df3ec9813c76d76644e478d5a96e0c256 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 10 Oct 2023 19:52:08 -0500 Subject: [PATCH 188/200] comment updates and reformat --- crypto/bls12381_utils_test.go | 2 +- crypto/bls_thresholdsign.go | 18 +++++++++--------- crypto/bls_thresholdsign_test.go | 2 -- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index 257ec1afa1b..a528e240363 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -220,7 +220,7 @@ func TestReadWriteG1(t *testing.T) { t.Run("infinity", func(t *testing.T) { var p, q pointE1 seed := make([]byte, frBytesLen) - unsafeMapToG1(&p, seed) // this results in the infinity point + unsafeMapToG1(&p, seed) // this results in the infinity point given how `unsafeMapToG1` works with an empty scalar 
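In the Zcash serialization format referenced by this patch set, an encoding can be recognized as the identity (infinity) point from its header byte alone: assuming compressed serialization, the compression bit (0x80) and the infinity bit (0x40) are set and all remaining bytes are zero, matching the `(G1_SERIALIZATION << 7) | (1 << 6)` header written by E1_write_bytes further down. A sketch of that kind of check (isCompressedIdentity is hypothetical; the diff does not show the body of IsBLSSignatureIdentity):

package main

import "fmt"

const g1SerBytes = 48 // assumed compressed G1 encoding size

// isCompressedIdentity reports whether b is the compressed encoding of the
// G1 identity: header byte 0xc0 followed by zero bytes only.
func isCompressedIdentity(b []byte) bool {
	if len(b) != g1SerBytes || b[0] != 0xc0 {
		return false
	}
	for _, v := range b[1:] {
		if v != 0 {
			return false
		}
	}
	return true
}

func main() {
	id := make([]byte, g1SerBytes)
	id[0] = 0xc0
	fmt.Println(isCompressedIdentity(id)) // true
}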
writePointE1(bytes, &p) require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check err := readPointE1(&q, bytes) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index efe660570db..412f06f962a 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -33,8 +33,6 @@ import ( // blsThresholdSignatureParticipant implements ThresholdSignatureParticipant // based on the BLS signature scheme -var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) - type blsThresholdSignatureParticipant struct { // embed the follower *blsThresholdSignatureInspector @@ -44,10 +42,10 @@ type blsThresholdSignatureParticipant struct { myPrivateKey PrivateKey } +var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) + // blsThresholdSignatureInspector implements ThresholdSignatureInspector // based on the BLS signature scheme -var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) - type blsThresholdSignatureInspector struct { // size of the group size int @@ -72,6 +70,8 @@ type blsThresholdSignatureInspector struct { lock sync.RWMutex } +var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) + // NewBLSThresholdSignatureParticipant creates a new instance of Threshold signature Participant using BLS. // A participant is able to participate in a threshold signing protocol as well as following the // protocol. @@ -82,8 +82,8 @@ type blsThresholdSignatureInspector struct { // participant is indexed by `myIndex` and holds the input private key // where n is the length of the public key shares slice. // -// The function returns -// - (nil, invalidInputsError) if: +// The function returns: +// - (nil, invalidInputsError) if: // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - input private key and public key at my index do not match @@ -138,8 +138,8 @@ func NewBLSThresholdSignatureParticipant( // Participants are defined by their public key share, and are indexed from 0 to n-1 // where n is the length of the public key shares slice. // -// The function returns -// - (nil, invalidInputsError) if: +// The function returns: +// - (nil, invalidInputsError) if: // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - (nil, notBLSKeyError) at least one public key is not of type pubKeyBLSBLS12381 @@ -535,7 +535,7 @@ func EnoughShares(threshold int, sharesNumber int) (bool, error) { // BLSThresholdKeyGen is a key generation for a BLS-based // threshold signature scheme with a trusted dealer. 
// -// The function returns : +// The function returns: // - (nil, nil, nil, invalidInputsErrorf) if: // - seed is too short // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go index 20d578db264..9f3f83cb387 100644 --- a/crypto/bls_thresholdsign_test.go +++ b/crypto/bls_thresholdsign_test.go @@ -618,7 +618,6 @@ func BenchmarkSimpleKeyGen(b *testing.B) { for i := 0; i < b.N; i++ { _, _, _, _ = BLSThresholdKeyGen(n, optimalThreshold(n), seed) } - b.StopTimer() } func BenchmarkSignatureReconstruction(b *testing.B) { @@ -647,5 +646,4 @@ func BenchmarkSignatureReconstruction(b *testing.B) { _, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers) require.NoError(b, err) } - b.StopTimer() } From 0587bc6e72ae1ee226bfdcd065236c77c4e0a439 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Thu, 12 Oct 2023 13:40:17 -0500 Subject: [PATCH 189/200] fix non-freed memory in error case --- crypto/bls12381_utils.c | 11 ++++++----- crypto/bls_core.c | 17 ++++++++++------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 528f865cfdd..fc29046e47f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -11,12 +11,12 @@ // make sure flow crypto types are consistent with BLST types void types_sanity(void) { - assert(sizeof(vec256) == sizeof(Fr)); + assert(sizeof(Fr) == sizeof(vec256)); assert(sizeof(Fp) == sizeof(vec384)); - assert(sizeof(vec384x) == sizeof(Fp2)); + assert(sizeof(Fp2) == sizeof(vec384x)); assert(sizeof(E1) == sizeof(POINTonE1)); assert(sizeof(E2) == sizeof(POINTonE2)); - assert(sizeof(vec384fp12) == sizeof(Fp12)); + assert(sizeof(Fp12) == sizeof(vec384fp12)); } // ------------------- Fr utilities @@ -556,9 +556,9 @@ ERROR E1_read_bytes(E1 *a, const byte *in, const int in_len) { // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) void E1_write_bytes(byte *out, const E1 *a) { if (E1_is_infty(a)) { + memset(out, 0, G1_SER_BYTES); // set the infinity bit out[0] = (G1_SERIALIZATION << 7) | (1 << 6); - memset(out + 1, 0, G1_SER_BYTES - 1); return; } E1 tmp; @@ -620,8 +620,9 @@ int E1_sum_vector_byte(byte *out, const byte *in_bytes, const int in_len) { int n = in_len / G1_SER_BYTES; // number of signatures E1 *vec = (E1 *)malloc(n * sizeof(E1)); - if (!vec) + if (!vec) { goto mem_error; + } // import the points from the array for (int i = 0; i < n; i++) { diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 19d29f46713..65f510f5987 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -401,11 +401,18 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, // build the arrays of G1 and G2 elements to verify E2 *pks = (E2 *)malloc(sigs_len * sizeof(E2)); - if (!pks) + if (!pks) { return; + } E1 *sigs = (E1 *)malloc(sigs_len * sizeof(E1)); - if (!sigs) + if (!sigs) { goto out_sigs; + } + + E1 h; + if (map_to_G1(&h, data, data_len) != VALID) { + goto out; + } for (int i = 0; i < sigs_len; i++) { // convert the signature points: @@ -440,11 +447,7 @@ void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, } // build a binary tree of aggregations node *root = build_tree(sigs_len, &pks[0], &sigs[0]); - if (!root) - goto out; - - E1 h; - if (map_to_G1(&h, data, data_len) != VALID) { + if (!root) { goto out; } From f64d5ea4fdd97372e35cde00d37092734e2bb3f3 Mon Sep 17 00:00:00 2001 From: Tarak 
Ben Youssef Date: Fri, 20 Oct 2023 13:36:14 -0500 Subject: [PATCH 190/200] use a common logic to detect ADX support and set the crypto flag --- Makefile | 17 +---------------- crypto_adx_flag.mk | 17 +++++++++++++++++ insecure/Makefile | 17 +---------------- integration/Makefile | 17 +---------------- 4 files changed, 20 insertions(+), 48 deletions(-) create mode 100644 crypto_adx_flag.mk diff --git a/Makefile b/Makefile index 874d56f8d72..18415b12a16 100644 --- a/Makefile +++ b/Makefile @@ -39,23 +39,8 @@ K8S_YAMLS_LOCATION_STAGING=./k8s/staging export CONTAINER_REGISTRY := gcr.io/flow-container-registry export DOCKER_BUILDKIT := 1 -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) -# detect ADX support on the CURRENT linux machine. - ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -else -# on non-linux machines, set the flag to 1 by default - ADX_SUPPORT := 1 -endif +include crypto_adx_flag.mk -# the crypto package uses BLST source files underneath which may use ADX insructions. -ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation - CRYPTO_FLAG := "" -else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" -endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) cmd/collection/collection: diff --git a/crypto_adx_flag.mk b/crypto_adx_flag.mk new file mode 100644 index 00000000000..22c405ab45d --- /dev/null +++ b/crypto_adx_flag.mk @@ -0,0 +1,17 @@ +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX insructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX insructions are supported, default is to use a fast ADX BLST implementation + CRYPTO_FLAG := "" +else +# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" +endif \ No newline at end of file diff --git a/insecure/Makefile b/insecure/Makefile index fd6fdae0dd9..d1dc33fa216 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -8,23 +8,8 @@ else RACE_FLAG := endif -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) -# detect ADX support on the CURRENT linux machine. - ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -else -# on non-linux machines, set the flag to 1 by default - ADX_SUPPORT := 1 -endif +include ../crypto_adx_flag.mk -# the crypto package uses BLST source files underneath which may use ADX insructions. 
-ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation - CRYPTO_FLAG := "" -else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" -endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # runs all unit tests of the insecure module diff --git a/integration/Makefile b/integration/Makefile index 963b7093511..b29e5bcf873 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -8,23 +8,8 @@ else RACE_FLAG := endif -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(shell uname -s),Linux) -# detect ADX support on the CURRENT linux machine. - ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -else -# on non-linux machines, set the flag to 1 by default - ADX_SUPPORT := 1 -endif +include ../crypto_adx_flag.mk -# the crypto package uses BLST source files underneath which may use ADX insructions. -ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation - CRYPTO_FLAG := "" -else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation - CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" -endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) # Run the integration test suite From 6c34ae3fccb9a23887c95d790e141a7c7000ed3f Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 20 Oct 2023 14:01:32 -0500 Subject: [PATCH 191/200] add noop target for empty ci operations --- .github/workflows/ci.yml | 2 +- .github/workflows/flaky-test-monitor.yml | 2 +- Makefile | 5 +++++ crypto/blst_src/README.md | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b24de2f44ca..9a88caa0e93 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,7 +135,7 @@ jobs: matrix: include: - name: crypto - setup: + setup: noop retries: 1 race: 1 - name: insecure diff --git a/.github/workflows/flaky-test-monitor.yml b/.github/workflows/flaky-test-monitor.yml index aa9d99dd65b..836af9c9228 100644 --- a/.github/workflows/flaky-test-monitor.yml +++ b/.github/workflows/flaky-test-monitor.yml @@ -83,7 +83,7 @@ jobs: matrix: include: - name: crypto - setup: + setup: noop race: 1 test_category: unit-crypto - name: insecure diff --git a/Makefile b/Makefile index 18415b12a16..3dd74fac0af 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,11 @@ include crypto_adx_flag.mk CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) +# needed for CI +.PHONY: noop +noop: + @echo "This is a no-op target" + cmd/collection/collection: $(CGO_FLAG) go build -o cmd/collection/collection cmd/collection/main.go diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md index 50ca45ea7d6..c2e89a1de71 100644 --- a/crypto/blst_src/README.md +++ b/crypto/blst_src/README.md @@ -28,4 +28,4 @@ To upgrade the BLST version: - [ ] solve all breaking changes that may occur. - [ ] update the commit version on this `./blst_src/README`. -Note that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. +Note that Flow crypto is using non exported internal functions from BLST. 
Checking for interfaces breaking changes in BLST should be done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. From b180269bb1f52da3d60bb6e4a1114262f9807860 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 20 Oct 2023 14:05:57 -0500 Subject: [PATCH 192/200] cleaning and typos --- Makefile | 5 ----- crypto/Makefile | 11 +++-------- crypto_adx_flag.mk | 6 +++--- integration/Makefile | 6 +----- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index 3dd74fac0af..76e23178cb5 100644 --- a/Makefile +++ b/Makefile @@ -58,9 +58,6 @@ cmd/util/util: update-core-contracts-version: ./scripts/update-core-contracts.sh $(CC_VERSION) -############################################################################################ -# CAUTION: DO NOT MODIFY THESE TARGETS! DOING SO WILL BREAK THE FLAKY TEST MONITOR - .PHONY: unittest-main unittest-main: # test all packages @@ -84,8 +81,6 @@ install-tools: check-go-version install-mock-generators verify-mocks: tidy generate-mocks git diff --exit-code -############################################################################################ - .SILENT: go-math-rand-check go-math-rand-check: # check that the insecure math/rand Go package isn't used by production code. diff --git a/crypto/Makefile b/crypto/Makefile index 43aae8ef39f..14016e40619 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -22,12 +22,12 @@ else ADX_SUPPORT := 1 endif -# the crypto package uses BLST source files underneath which may use ADX insructions. +# the crypto package uses BLST source files underneath which may use ADX instructions. ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation +# if ADX instructions are supported, default is to use a fast ADX BLST implementation CRYPTO_FLAG := "" else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation +# if ADX instructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) @@ -75,8 +75,6 @@ c-sanitize: c-asan # - address sanitization and other checks (only on linux) # - memory sanitization (target m-san) is disabled because of multiple false positives - - # Go tidy .PHONY: go-tidy go-tidy: @@ -90,9 +88,6 @@ lint: go-tidy # revive -config revive.toml golangci-lint run -v ./... - - - # test all packages .PHONY: test test: diff --git a/crypto_adx_flag.mk b/crypto_adx_flag.mk index 22c405ab45d..667c8d493d3 100644 --- a/crypto_adx_flag.mk +++ b/crypto_adx_flag.mk @@ -7,11 +7,11 @@ else ADX_SUPPORT := 1 endif -# the crypto package uses BLST source files underneath which may use ADX insructions. +# the crypto package uses BLST source files underneath which may use ADX instructions. 
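The included crypto_adx_flag.mk detects ADX at build time by grepping /proc/cpuinfo on Linux and defaults ADX_SUPPORT to 1 elsewhere. The same capability can also be probed at run time from Go; a sketch assuming golang.org/x/sys/cpu and its X86.HasADX flag (that package is not part of this patch set):

package main

import (
	"fmt"
	"runtime"

	"golang.org/x/sys/cpu"
)

func main() {
	// cpu.X86.HasADX is only meaningful on x86; note the Makefile instead
	// defaults ADX_SUPPORT to 1 on non-Linux hosts, where /proc/cpuinfo is
	// unavailable.
	if runtime.GOARCH == "amd64" && cpu.X86.HasADX {
		fmt.Println("ADX available: default (fast) BLST build is fine")
	} else {
		fmt.Println(`no ADX detected: build with CGO_CFLAGS="-O -D__BLST_PORTABLE__"`)
	}
}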
ifeq ($(ADX_SUPPORT), 1) -# if ADX insructions are supported, default is to use a fast ADX BLST implementation +# if ADX instructions are supported, default is to use a fast ADX BLST implementation CRYPTO_FLAG := "" else -# if ADX insructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation +# if ADX instructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif \ No newline at end of file diff --git a/integration/Makefile b/integration/Makefile index b29e5bcf873..1e73769a50f 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -19,10 +19,6 @@ integration-test: access-tests ghost-tests mvp-tests execution-tests verificatio .PHONY: ci-integration-test ci-integration-test: access-tests ghost-tests mvp-tests epochs-cohort1-tests epochs-cohort2-tests consensus-tests execution-tests verification-tests upgrades-tests network-tests collection-tests -############################################################################################ -# CAUTION: DO NOT MODIFY THE TARGETS BELOW! DOING SO WILL BREAK THE FLAKY TEST MONITOR -# In particular, do not skip tests by commenting them out here. - # Run unit tests for test utilities in this module .PHONY: test test: @@ -88,4 +84,4 @@ bft-gossipsub-tests: .PHONY: bft-tests bft-tests: bft-framework-tests bft-protocol-tests bft-gossipsub-tests -############################################################################################ + From 7ad6a7a5e93154fb051a98a5f704a3027ddefc57 Mon Sep 17 00:00:00 2001 From: Jordan Schalm Date: Mon, 23 Oct 2023 09:38:28 -0700 Subject: [PATCH 193/200] make tidy --- go.sum | 9 +-------- insecure/go.sum | 6 +----- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/go.sum b/go.sum index 8feff29055b..3dfa262c25d 100644 --- a/go.sum +++ b/go.sum @@ -1323,13 +1323,8 @@ github.com/onflow/flow-core-contracts/lib/go/templates v0.14.0/go.mod h1:ZeLxwaB github.com/onflow/flow-ft/lib/go/contracts v0.7.1-0.20230711213910-baad011d2b13 h1:B4ll7e3j+MqTJv2122Enq3RtDNzmIGRu9xjV7fo7un0= github.com/onflow/flow-ft/lib/go/contracts v0.7.1-0.20230711213910-baad011d2b13/go.mod h1:kTMFIySzEJJeupk+7EmXs0EJ6CBWY/MV9fv9iYQk+RU= github.com/onflow/flow-go-sdk v0.24.0/go.mod h1:IoptMLPyFXWvyd9yYA6/4EmSeeozl6nJoIv4FaEMg74= -github.com/onflow/flow-go-sdk v0.41.10 h1:Cio6GJhtx532TUY+cqrqWglD5sZCXkWeM5QvaRha3p4= -github.com/onflow/flow-go-sdk v0.41.10/go.mod h1:0a0LiQFbFt8RW/ptoMUU7YkvW9ArVcbjLE0XS78uz1E= github.com/onflow/flow-go-sdk v0.41.9 h1:cyplhhhc0RnfOAan2t7I/7C9g1hVGDDLUhWj6ZHAkk4= github.com/onflow/flow-go-sdk v0.41.9/go.mod h1:e9Q5TITCy7g08lkdQJxP8fAKBnBoC5FjALvUKr36j4I= -github.com/onflow/flow-go/crypto v0.21.3/go.mod h1:vI6V4CY3R6c4JKBxdcRiR/AnjBfL8OSD97bJc60cLuQ= -github.com/onflow/flow-go/crypto v0.24.9 h1:0EQp+kSZYJepMIiSypfJVe7tzsPcb6UXOdOtsTCDhBs= -github.com/onflow/flow-go/crypto v0.24.9/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0 h1:rhUDeD27jhLwOqQKI/23008CYfnqXErrJvc4EFRP2a0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0/go.mod h1:YsvzYng4htDgRB9sa9jxdwoTuuhjK8WYWXTyLkIigZY= github.com/onflow/flow/protobuf/go/flow v0.2.2/go.mod h1:gQxYqCfkI8lpnKsmIjwtN2mV/N2PIwc1I+RUK4HPIc8= @@ -1775,10 +1770,8 @@ golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5 golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod 
h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= -golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= diff --git a/insecure/go.sum b/insecure/go.sum index a3258e00f67..d918d8fc6c0 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -1744,10 +1744,8 @@ golang.org/x/crypto v0.0.0-20210513164829-c07d793c2f9a/go.mod h1:P+XmwS30IXTQdn5 golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM= -golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I= golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1856,8 +1854,6 @@ golang.org/x/net v0.0.0-20210423184538-5f58ad60dda6/go.mod h1:OJAsFXCWl8Ukc7SiCT golang.org/x/net v0.0.0-20210503060351-7fd8e65b6420/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= -golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= From f45e54690754564a4dfaa64548650cf52b9fdfda Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Mon, 23 Oct 2023 16:58:29 -0500 Subject: [PATCH 194/200] fix overwritten test settings --- .../tests/epochs/cohort2/epoch_join_and_leave_vn_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go index 
65569dacd08..a6612ced27c 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go @@ -28,7 +28,7 @@ func (s *EpochJoinAndLeaveVNSuite) SetupTest() { s.DKGPhaseLen = 100 s.EpochLen = 450 s.EpochCommitSafetyThreshold = 20 - s.DynamicEpochTransitionSuite.SetupTest() + s.Suite.SetupTest() } // TestEpochJoinAndLeaveVN should update verification nodes and assert healthy network conditions From 99b1237305d2240fe86aaee700d64ca0ff829fc6 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 14 Nov 2023 20:14:19 -0500 Subject: [PATCH 195/200] remove crypto setup --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5a9e450c264..9b7e7b8fdaf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -190,8 +190,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Save Docker images From fbfecc1d0a1d321c67a6a79dea2bc841c57a9788 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 21 Nov 2023 17:17:21 -0500 Subject: [PATCH 196/200] update graceful stop duration to 1s in integration upgrade test --- integration/tests/upgrades/suite.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integration/tests/upgrades/suite.go b/integration/tests/upgrades/suite.go index dbc40e810aa..93094b8c13b 100644 --- a/integration/tests/upgrades/suite.go +++ b/integration/tests/upgrades/suite.go @@ -83,10 +83,12 @@ func (s *Suite) SetupTest() { testnet.WithLogLevel(zerolog.WarnLevel), testnet.WithID(s.exe1ID), testnet.WithAdditionalFlag("--extensive-logging=true"), + testnet.WithAdditionalFlag("--max-graceful-stop-duration=1s"), ), testnet.NewNodeConfig( flow.RoleExecution, testnet.WithLogLevel(zerolog.WarnLevel), + testnet.WithAdditionalFlag("--max-graceful-stop-duration=1s"), ), testnet.NewNodeConfig(flow.RoleConsensus, consensusConfigs...), testnet.NewNodeConfig(flow.RoleConsensus, consensusConfigs...), From cd2d74e8b6b0d40b6805156943d678bb8db15f72 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Fri, 24 Nov 2023 16:56:40 -0500 Subject: [PATCH 197/200] slow down block rate in integration access cohort1 test --- integration/tests/access/cohort1/access_api_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/integration/tests/access/cohort1/access_api_test.go b/integration/tests/access/cohort1/access_api_test.go index cb5a175130d..24409f84ad2 100644 --- a/integration/tests/access/cohort1/access_api_test.go +++ b/integration/tests/access/cohort1/access_api_test.go @@ -87,7 +87,12 @@ func (s *AccessAPISuite) SetupTest() { ) consensusConfigs := []func(config *testnet.NodeConfig){ - testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=100ms"), + // `cruise-ctl-fallback-proposal-duration` is set to 250ms instead of 100ms + // to purposely slow down the block rate. This is needed since the crypto module + // update provides faster BLS operations.
+ // TODO: fix the access integration test logic to function without slowing down + // the block rate + testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=250ms"), testnet.WithAdditionalFlagf("--required-verification-seal-approvals=%d", 1), testnet.WithAdditionalFlagf("--required-construction-seal-approvals=%d", 1), testnet.WithLogLevel(zerolog.FatalLevel), From 769ad6727c2bc2e4963cb850e78f643286218030 Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Dec 2023 16:14:48 -0600 Subject: [PATCH 198/200] slow down block production in bft tests --- integration/tests/bft/base_suite.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/integration/tests/bft/base_suite.go b/integration/tests/bft/base_suite.go index b50085a9e50..2e6e74de881 100644 --- a/integration/tests/bft/base_suite.go +++ b/integration/tests/bft/base_suite.go @@ -77,7 +77,12 @@ func (b *BaseSuite) SetupSuite() { testnet.WithLogLevel(zerolog.FatalLevel), testnet.WithAdditionalFlag("--required-verification-seal-approvals=1"), testnet.WithAdditionalFlag("--required-construction-seal-approvals=1"), - testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=1ms"), + // `cruise-ctl-fallback-proposal-duration` is set to 250ms instead of 1ms + // to purposely slow down the block rate. This is needed since the crypto module + // update provides faster BLS operations. + // TODO: fix the access integration test logic to function without slowing down + // the block rate + testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=250ms"), ) b.NodeConfigs = append(b.NodeConfigs, nodeConfig) } From f7dac6cff619f3f41bdd1679205ff50aa378d80a Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Dec 2023 16:42:06 -0600 Subject: [PATCH 199/200] slow down block rate in SN test --- .../epochs/cohort2/epoch_join_and_leave_sn_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go index fb825e447a6..2073e693988 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go @@ -2,6 +2,7 @@ package cohort2 import ( "testing" + "time" "github.com/stretchr/testify/suite" @@ -17,6 +18,15 @@ type EpochJoinAndLeaveSNSuite struct { epochs.DynamicEpochTransitionSuite } +func (s *EpochJoinAndLeaveSNSuite) SetupTest() { + // slow down the block rate. This is needed since the crypto module + // update provides faster BLS operations. + // TODO: fix the access integration test logic to function without slowing down + // the block rate + s.ConsensusProposalDuration = time.Millisecond * 250 + s.Suite.SetupTest() +} + // TestEpochJoinAndLeaveSN should update consensus nodes and assert healthy network conditions // after the epoch transition completes. See health check function for details.
func (s *EpochJoinAndLeaveSNSuite) TestEpochJoinAndLeaveSN() { From e3e29f049b1bb3409d9d5dfc469a4e12c93f75fe Mon Sep 17 00:00:00 2001 From: Tarak Ben Youssef Date: Tue, 5 Dec 2023 17:15:37 -0600 Subject: [PATCH 200/200] fix SN integration test bug --- .../tests/epochs/cohort2/epoch_join_and_leave_sn_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go index 2073e693988..d101af6371d 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go @@ -24,7 +24,7 @@ func (s *EpochJoinAndLeaveSNSuite) SetupTest() { // TODO: fix the access integration test logic to function without slowing down // the block rate s.ConsensusProposalDuration = time.Millisecond * 250 - s.Suite.SetupTest() + s.DynamicEpochTransitionSuite.SetupTest() } // TestEpochJoinAndLeaveSN should update consensus nodes and assert healthy network conditions